8000 How to merge multiple CSV files with Python · tekrajchhetri/python@1e5744d · GitHub
[go: up one dir, main page]

Skip to content
8000

Commit 1e5744d

Browse files
committed
How to merge multiple CSV files with Python
1 parent a2c8a0e commit 1e5744d

File tree

5 files changed

+354
-0
lines changed

5 files changed

+354
-0
lines changed

notebooks/csv/data_201901.csv

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
col1,col2,col3
2+
A,B,1
3+
AA,BB,2

notebooks/csv/data_201902.csv

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
col1,col2,col3
2+
C,D,3
3+
CC,DD,4

notebooks/csv/data_202001.csv

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
col1,col2,col3,col4
2+
E,F,5,e5
3+
EE,FF,6,ee6

notebooks/csv/data_202002.csv

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
col1,col2,col3,col5
2+
H,J,7,77
3+
HH,JJ,8,88
Lines changed: 342 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,342 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# How to merge multiple CSV files with Python\n",
8+
"Python convert normal JSON to JSON separated lines 3 examples\n",
9+
"\n",
10+
"* Steps to merge multiple CSV(identical) files with Python\n",
11+
&quo 57A6 t;* Steps to merge multiple CSV(identical) files with Python with trace\n",
12+
"* Combine multiple CSV files when the columns are different\n",
13+
"* Bonus: Merge multiple files with Windows/Linux"
14+
]
15+
},
16+
{
17+
"cell_type": "markdown",
18+
"metadata": {},
19+
"source": [
20+
"## 1. Steps to merge multiple CSV(identical) files with Python"
21+
]
22+
},
23+
{
24+
"cell_type": "code",
25+
"execution_count": 1,
26+
"metadata": {},
27+
"outputs": [],
28+
"source": [
29+
"import os, glob\n",
30+
"import pandas as pd\n",
31+
"\n",
32+
"path = \"../../csv/\"\n",
33+
"\n",
34+
"all_files = glob.glob(os.path.join(path, \"data_2019*.csv\"))\n",
35+
"\n",
36+
"all_csv = (pd.read_csv(f, sep=',') for f in all_files)\n",
37+
"df_merged = pd.concat(all_csv, ignore_index=True)\n",
38+
"df_merged.to_csv( \"merged.csv\")"
39+
]
40+
},
41+
{
42+
"cell_type": "markdown",
43+
"metadata": {},
44+
"source": [
45+
"## 2. Steps to merge multiple CSV(identical) files with Python with trace"
46+
]
47+
},
48+
{
49+
"cell_type": "code",
50+
"execution_count": 2,
51+
"metadata": {},
52+
"outputs": [
53+
{
54+
"data": {
55+
"text/html": [
56+
"<div>\n",
57+
"<style scoped>\n",
58+
" .dataframe tbody tr th:only-of-type {\n",
59+
" vertical-align: middle;\n",
60+
" }\n",
61+
"\n",
62+
" .dataframe tbody tr th {\n",
63+
" vertical-align: top;\n",
64+
" }\n",
65+
"\n",
66+
" .dataframe thead th {\n",
67+
" text-align: right;\n",
68+
" }\n",
69+
"</style>\n",
70+
"<table border=\"1\" class=\"dataframe\">\n",
71+
" <thead>\n",
72+
" <tr style=\"text-align: right;\">\n",
73+
" <th></th>\n",
74+
" <th>col1</th>\n",
75+
" <th>col2</th>\n",
76+
" <th>col3</th>\n",
77+
" <th>file</th>\n",
78+
" </tr>\n",
79+
" </thead>\n",
80+
" <tbody>\n",
81+
" <tr>\n",
82+
" <th>0</th>\n",
83+
" <td>C</td>\n",
84+
" <td>D</td>\n",
85+
" <td>3</td>\n",
86+
" <td>data_201902.csv</td>\n",
87+
" </tr>\n",
88+
" <tr>\n",
89+
" <th>1</th>\n",
90+
" <td>CC</td>\n",
91+
" <td>DD</td>\n",
92+
" <td>4</td>\n",
93+
" <td>data_201902.csv</td>\n",
94+
" </tr>\n",
95+
" <tr>\n",
96+
" <th>2</th>\n",
97+
" <td>A</td>\n",
98+
" <td>B</td>\n",
99+
" <td>1</td>\n",
100+
" <td>data_201901.csv</td>\n",
101+
" </tr>\n",
102+
" <tr>\n",
103+
" <th>3</th>\n",
104+
" <td>AA</td>\n",
105+
" <td>BB</td>\n",
106+
" <td>2</td>\n",
107+
" <td>data_201901.csv</td>\n",
108+
" </tr>\n",
109+
" </tbody>\n",
110+
"</table>\n",
111+
"</div>"
112+
],
113+
"text/plain": [
114+
" col1 col2 col3 file\n",
115+
"0 C D 3 data_201902.csv\n",
116+
"1 CC DD 4 data_201902.csv\n",
117+
"2 A B 1 data_201901.csv\n",
118+
"3 AA BB 2 data_201901.csv"
119+
]
120+
},
121+
"execution_count": 2,
122+
"metadata": {},
123+
"output_type": "execute_result"
124+
}
125+
],
126+
"source": [
127+
"import os, glob\n",
128+
"import pandas as pd\n",
129+
"\n",
130+
"path = \"../../csv/\"\n",
131+
"\n",
132+
"all_files = glob.glob(os.path.join(path, \"data_2019*.csv\"))\n",
133+
"\n",
134+
"all_df = []\n",
135+
"for f in all_files:\n",
136+
" df = pd.read_csv(f, sep=',')\n",
137+
" df['file'] = f.split('/')[-1]\n",
138+
" all_df.append(df)\n",
139+
" \n",
140+
"merged_df = pd.concat(all_df, ignore_index=True)\n",
141+
"merged_df"
142+
]
143+
},
144+
{
145+
"cell_type": "markdown",
146+
"metadata": {},
147+
"source": [
148+
"## 3. Combine multiple CSV files when the columns are different"
149+
]
150+
},
151+
{
152+
"cell_type": "code",
153+
"execution_count": 7,
154+
"metadata": {},
155+
"outputs": [
156+
{
157+
"data": {
158+
"text/html": [
159+
"<div>\n",
160+
"<style scoped>\n",
161+
" .dataframe tbody tr th:only-of-type {\n",
162+
" vertical-align: middle;\n",
163+
" }\n",
164+
"\n",
165+
" .dataframe tbody tr th {\n",
166+
" vertical-align: top;\n",
167+
" }\n",
168+
"\n",
169+
" .dataframe thead th {\n",
170+
" text-align: right;\n",
171+
" }\n",
172+
"</style>\n",
173+
"<table border=\"1\" class=\"dataframe\">\n",
174+
" <thead>\n",
175+
" <tr style=\"text-align: right;\">\n",
176+
" <th></th>\n",
177+
" <th>col1</th>\n",
178+
" <th>col2</th>\n",
179+
" <th>col3</th>\n",
180+
" <th>col4</th>\n",
181+
" <th>col5</th>\n",
182+
" <th>file</th>\n",
183+
" </tr>\n",
184+
" </thead>\n",
185+
" <tbody>\n",
186+
" <tr>\n",
187+
" <th>0</th>\n",
188+
" <td>E</td>\n",
189+
" <td>F</td>\n",
190+
" <td>5</td>\n",
191+
" <td>e5</td>\n",
192+
" <td>NaN</td>\n",
193+
" <td>data_202001.csv</td>\n",
194+
" </tr>\n",
195+
" <tr>\n",
196+
" <th>1</th>\n",
197+
" <td>EE</td>\n",
198+
" <td>FF</td>\n",
199+
" <td>6</td>\n",
200+
" <td>ee6</td>\n",
201+
" <td>NaN</td>\n",
202+
" <td>data_202001.csv</td>\n",
203+
" </tr>\n",
204+
" <tr>\n",
205+
" <th>2</th>\n",
206+
" <td>H</td>\n",
207+
" <td>J</td>\n",
208+
" <td>7</td>\n",
209+
" <td>NaN</td>\n",
210+
" <td>77.0</td>\n",
211+
" <td>data_202002.csv</td>\n",
212+
" </tr>\n",
213+
" <tr>\n",
214+
" <th>3</th>\n",
215+
" <td>HH</td>\n",
216+
" <td>JJ</td>\n",
217+
" <td>8</td>\n",
218+
" <td>NaN</td>\n",
219+
" <td>88.0</td>\n",
220+
" <td>data_202002.csv</td>\n",
221+
" </tr>\n",
222+
" <tr>\n",
223+
" <th>4</th>\n",
224+
" <td>C</td>\n",
225+
" <td>D</td>\n",
226+
" <td>3</td>\n",
227+
" <td>NaN</td>\n",
228+
" <td>NaN</td>\n",
229+
" <td>data_201902.csv</td>\n",
230+
" </tr>\n",
231+
" <tr>\n",
232+
" <th>5</th>\n",
233+
" <td>CC</td>\n",
234+
" <td>DD</td>\n",
235+
" <td>4</td>\n",
236+
" <td>NaN</td>\n",
237+
" <td>NaN</td>\n",
238+
" <td>data_201902.csv</td>\n",
239+
" </tr>\n",
240+
" <tr>\n",
241+
" <th>6</th>\n",
242+
" <td>A</td>\n",
243+
" <td>B</td>\n",
244+
" <td>1</td>\n",
245+
" <td>NaN</td>\n",
246+
" <td>NaN</td>\n",
247+
" <td>data_201901.csv</td>\n",
248+
" </tr>\n",
249+
" <tr>\n",
250+
" <th>7</th>\n",
251+
" <td>AA</td>\n",
252+
" <td>BB</td>\n",
253+
" <td>2</td>\n",
254+
" <td>NaN</td>\n",
255+
" <td>NaN</td>\n",
256+
" <td>data_201901.csv</td>\n",
257+
" </tr>\n",
258+
" </tbody>\n",
259+
"</table>\n",
260+
"</div>"
261+
],
262+
"text/plain": [
263+
" col1 col2 col3 col4 col5 file\n",
264+
"0 E F 5 e5 NaN data_202001.csv\n",
265+
"1 EE FF 6 ee6 NaN data_202001.csv\n",
266+
"2 H J 7 NaN 77.0 data_202002.csv\n",
267+
"3 HH JJ 8 NaN 88.0 data_202002.csv\n",
268+
"4 C D 3 NaN NaN data_201902.csv\n",
269+
"5 CC DD 4 NaN NaN data_201902.csv\n",
270+
"6 A B 1 NaN NaN data_201901.csv\n",
271+
"7 AA BB 2 NaN NaN data_201901.csv"
272+
]
273+
},
274+
"execution_count": 7,
275+
"metadata": {},
276+
"output_type": "execute_result"
277+
}
278+
],
279+
"source": [
280+
"import os, glob\n",
281+
"import pandas as pd\n",
282+
"\n",
283+
"path = \"../../csv/\"\n",
284+
"\n",
285+
"all_files = glob.glob(os.path.join(path, \"data_*.csv\"))\n",
286+
"\n",
287+
"\n",
288+
"all_df = []\n",
289+
"for f in all_files:\n",
290+
" df = pd.read_csv(f, sep=',')\n",
291+
" df['file'] = f.split('/')[-1]\n",
292+
" all_df.append(df)\n",
293+
" \n",
294+
"merged_df = pd.concat(all_df, ignore_index=True, sort=True)\n",
295+
"merged_df"
296+
]
297+
},
298+
{
299+
"cell_type": "markdown",
300+
"metadata": {},
301+
"source": [
302+
"## 4. Bonus: Merge multiple files with Windows/Linux\n",
303+
"\n",
304+
"Linux\n",
305+
"\n",
306+
"`sed 1d data_*.csv > merged.csv`\n",
307+
"\n",
308+
"Windows\n",
309+
"\n",
310+
"`C:\\> copy data_*.csv merged.csv `"
311+
]
312+
},
313+
{
314+
"cell_type": "code",
315+
"execution_count": null,
316+
"metadata": {},
317+
"outputs": [],
318+
"source": []
319+
}
320+
],
321+
"metadata": {
322+
"kernelspec": {
323+
"display_name": "Python 3",
324+
"language": "python",
325+
"name": "python3"
326+
},
327+
"language_info": {
328+
"codemirror_mode": {
329+
"name": "ipython",
330+
"version": 3
331+
},
332+
"file_extension": ".py",
333+
"mimetype": "text/x-python",
334+
"name": "python",
335+
"nbconvert_exporter": "python",
336+
"pygments_lexer": "ipython3",
337+
"version": "3.6.9"
338+
}
339+
},
340+
"nbformat": 4,
341+
"nbformat_minor": 2
342+
}

0 commit comments

Comments
 (0)
0