1
- ### WIP
2
-
3
1
import sys
4
2
import os
5
- import getopt
6
3
import csv
4
+ import argparse
7
5
8
6
"""
9
- Splits a CSV file into multiple pieces based on command line arguments.
7
+
8
+ Splits a CSV file into multiple pieces based on command line arguments.
10
9
8000
11
10
Arguments:
12
- `-h`: help file of usage of the script
13
- `-i `: input file name
14
- `-o `: output file, A %s-style template for the numbered output files.
15
- `-r `: row limit to split
16
- `-c `: A %s-style template for the numbered output files.
11
+
12
+ `-h `: help file of usage of the script
13
+ `-i `: input file name
14
+ `-o `: output file name
15
+ `-r `: row limit to split
17
16
18
17
Default settings:
19
- `output_path` is the current directory
20
- `keep_headers` is on (headers will be kept)
21
- `delimeter` is ,
18
+
19
+ `output_path` is the current directory
20
+ headers are displayed on each split file
21
+ the default delimeter is a comma
22
22
23
23
Example usage:
24
- # split by every 10000 rows
25
- >> python 12_csv_split.py -i input.csv -o rownumber -r 10000
26
- # split by unique items in column 0
27
- >> python 12_csv_split.py -i input.csv -o userid -c 0
28
- # access help
29
- >> python 12_csv_split.py -h for help
30
-
24
+
25
+ ```
26
+ # split csv by every 100 rows
27
+ >> python csv_split.py -i input.csv -o output -r 100
28
+ ```
29
+
31
30
"""
32
31
33
- def main (argv ):
34
-
35
- argument_dict = grab_command_line_arguments (argv )
36
- parse_file (argument_dict )
37
-
38
-
39
- def grab_command_line_arguments (argv ):
40
-
41
- # global variables
42
- inputfile = ''
43
- outputfile = ''
44
- rowlimit = ''
45
- columnindex = ''
46
- argument_dict = {}
47
-
48
- # grab arguments
49
- opts , args = getopt .getopt (argv ,"hi:o:r:c:" ,["ifile=" ,"ofile=" ,"rowlimit=" ,"columnindex=" ])
50
-
51
- # end if no arguments provided
52
- if not opts :
53
- print "No options provided. Try again. Use `-h` for help."
54
- sys .exit ()
55
-
56
- # grab arguments
57
- for opt , arg in opts :
58
- if opt == '-h' :
59
- print 'csvsplit.py -i <inputfile> -r <row limit> -c <column index> -o <outputfile>'
60
- sys .exit ()
61
- elif opt in ("-i" , "--ifile" ):
62
- inputfile = arg
63
- elif opt in ("-o" , "--ofile" ):
64
- outputfile = arg
65
- elif opt in ("-r" , "--rowlimit" ):
66
- rowlimit = arg
67
- elif opt in ("-c" , "--columnindex" ):
68
- columnindex = arg
69
-
70
- # Output arguments
71
- print "\n Arguments:"
72
- if inputfile :
73
- argument_dict ["input_file" ] = inputfile
74
- print "Input file is '{}'" .format (inputfile )
75
- else :
76
- "Please enter an input file."
77
- if outputfile :
78
- argument_dict ["output_file" ] = outputfile
79
- print "Output file is '{}'" .format (outputfile )
80
- else :
81
- print "Please enter an output file."
82
- if rowlimit :
83
- argument_dict ["rowlimit" ] = rowlimit
84
- print "Rowlimit is '{}'" .format (rowlimit )
85
- if columnindex :
86
- argument_dict ["columnindex" ] = columnindex
87
- print "Columnindex is '{}'" .format (columnindex )
88
- if rowlimit and columnindex :
89
- print "Please use either a rowlimit or columnlimit, not both."
90
- sys .exit ()
91
- if not rowlimit or columnindex :
92
- print "Please enter either a rowlimit or columnlimit."
93
- sys .exit ()
94
-
95
- # to do - check to make sure file, rowlimit, and columnlimit exist
96
- print argument_dict
97
- return argument_dict
98
-
99
-
100
- def parse_file (argument_dict ):
101
-
102
- #split csv file by certain rownumber
103
- if argument_dict ["rowlimit" ]:
104
- rowlimit = int (argument_dict ["rowlimit" ])
105
- output_name_file = "{}.csv" .format (argument_dict ["output_file" ])
106
- output_path = '.'
107
- keep_headers = True
108
- delimiter = ','
109
- filehandler = open (argument_dict ["input_file" ],'r' )
110
- reader = csv .reader (filehandler , delimiter = delimiter )
111
- current_piece = 1
112
- current_out_path = os .path .join (
113
- output_path ,
114
- output_name_file
32
+
33
+ def get_arguments ():
34
+ """Grab user supplied arguments using the argparse library."""
35
+
36
+ # Use arparse to get command line arguments
37
+ parser = argparse .ArgumentParser ()
38
+ parser .add_argument ("-i" , "--input_file" , required = True ,
39
+ help = "csv input file (with extension)" , type = str )
40
+ parser .add_argument ("-o" , "--output_file" , required = True ,
41
+ help = "csv output file (without extension)" , type = str )
42
+ parser .add_argument ("-r" , "--row_limit" , required = True ,
43
+ help = "row limit to split csv at" , type = int )
44
+ args = parser .parse_args ()
45
+
46
+ # Check if the input_file exits
47
+ is_valid_file (parser , args .input_file )
48
+
49
+ # Check if the input_file is valid
50
+ is_valid_csv (parser , args .input_file , args .row_limit )
51
+
52
+ return args .input_file , args .output_file , args .row_limit
53
+
54
+
55
+ def is_valid_file (parser , file_name ):
56
+ """Ensure that the input_file exists."""
57
+ if not os .path .exists (file_name ):
58
+ parser .error ("The file '{}' does not exist!" .format (file_name ))
59
+ sys .exit (1 )
60
+
61
+
62
+ def is_valid_csv (parser , file_name , row_limit ):
63
+ """
64
+ Ensure that the # of rows in the input_file
65
+ is greater than the row_limit.
66
+ """
67
+ row_count = 0
68
+ for row in csv .reader (open (file_name )):
69
+ row_count += 1
70
+ # Note: You could also use a generator expression
71
+ # and the sum() function to count the rows:
72
+ # row_count = sum(1 for row in csv.reader(open(file_name)))
73
+ if row_limit > row_count :
74
+ parser .error (
75
+ "The 'row_count' of '{}' is > the number of rows in '{}'!"
76
+ .format (row_limit , file_name )
115
77
)
116
- current_out_writer = csv .writer (open (current_out_path , 'w' ), delimiter = delimiter )
117
- current_limit = rowlimit
118
- if keep_headers :
119
- headers = reader .next ()
120
- current_out_writer .writerow (headers )
121
- for i , row in enumerate (reader ):
122
- if i + 1 > current_limit :
123
- current_piece += 1
124
- current_limit = rowlimit * current_piece
125
- current_out_path = os .path .join (
126
- output_path ,
127
- output_name_file
128
- )
129
- current_out_writer = csv .writer (open (current_out_path , 'w' ), delimiter = delimiter )
130
-
131
- # elif columnindex: #split csv file accrording to unique values of certain column,it's like filter only certain item in excel
132
- # itemlist = []
133
- # columnindex = int(columnindex)
134
- # output_name_template= outputfile+'_%s.csv'
135
- # output_path='.'
136
- # keep_headers=True
137
- # delimiter=','
138
- # filehandler = open(inputfile,'r')
139
- # reader = csv.reader(filehandler, delimiter=delimiter)
140
- # if keep_headers:
141
- # headers = reader.next()
142
-
143
- # for i, row in enumerate(reader):
144
-
145
- # current_out_path = os.path.join(
146
- # output_path,
147
- # output_name_template % row[columnindex] )
148
- # if row[columnindex] not in itemlist:
149
- # try:
150
- # current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
151
- # except IOError:
152
- # continue
153
- # else:
154
- # itemlist.append(row[columnindex])
155
- # if keep_headers:
156
- # current_out_writer.writerow(headers)
157
- # current_out_writer.writerow(row)
158
- # else:
159
- # current_out_writer = csv.writer(open(current_out_path, 'a'), delimiter=delimiter)
160
- # current_out_writer.writerow(row)
161
- # print 'totally %i unique items in column %i \n' % (len(itemlist),columnindex)
162
- # else:
163
- # print "oops, please check instruction of script by >>./csvsplit.py -h"
78
+ sys .exit (1 )
79
+
80
+
81
+ def parse_file (arguments ):
82
+ """
83
+ Splits the CSV into multiple files or chunks based on the row_limit.
84
+ Then create new CSV files.
85
+ """
86
+ input_file = arguments [0 ]
87
+ output_file = arguments [1 ]
88
+ row_limit = arguments [2 ]
89
+ output_path = '.' # Current directory
90
+
91
+ # Read CSV, split into list of lists
92
+ with open (input_file , 'r' ) as input_csv :
93
+ datareader = csv .reader (input_csv )
94
+ all_rows = []
95
+ for row in datareader :
96
+ all_rows .append (row )
97
+
98
+ # Remove header
99
+ header = all_rows .pop (0 )
100
+
101
+ # Split list of list into chunks
102
+ current_chunk = 0
103
+ for i in range (0 , len (all_rows ), row_limit ): # Loop through list
104
+ chunk = all_rows [i :i + row_limit ] # Create single chunk
105
+
106
+ current_output = os .path .join ( # Create new output file
107
+ output_path ,
108
+ "{}-{}.csv" .format (output_file , current_chunk )
109
+ )
110
+
111
+ # Add header
112
+ chunk .insert (0 , header )
113
+
114
+ # Write chunk to output file
115
+ with open (current_output , 'w' ) as output_csv :
116
+ writer = csv .writer (output_csv )
117
+ writer = writer .writerows (chunk )
118
+
119
+ # Output info
120
+ print ""
121
+ print "Chunk # {}:" .format (current_chunk )
122
+ print "Filepath: {}" .format (current_output )
123
+ print "# of rows: {}" .format (len (chunk ))
124
+
125
+ # Create new chunk
126
+ current_chunk += 1
164
127
165
128
166
129
if __name__ == "__main__" :
167
- main (sys .argv [1 :])
130
+ arguments = get_arguments ()
131
+ parse_file (arguments )
0 commit comments