1- ### WIP
2-
31import sys
42import os
5- import getopt
63import csv
4+ import argparse
75
86"""
9- Splits a CSV file into multiple pieces based on command line arguments.
7+
8+ Splits a CSV file into multiple pieces based on command line arguments.
109
1110 Arguments:
12- `-h`: help file of usage of the script
13- `-i `: input file name
14- `-o `: output file, A %s-style template for the numbered output files.
15- `-r `: row limit to split
16- `-c `: A %s-style template for the numbered output files.
11+
12+ `-h `: help file of usage of the script
13+ `-i `: input file name
14+ `-o `: output file name
15+ `-r `: row limit to split
1716
1817 Default settings:
19- `output_path` is the current directory
20- `keep_headers` is on (headers will be kept)
21- `delimeter` is ,
18+
19+ `output_path` is the current directory
20+ headers are displayed on each split file
21+ the default delimeter is a comma
2222
2323 Example usage:
24- # split by every 10000 rows
25- >> python 12_csv_split.py -i input.csv -o rownumber -r 10000
26- # split by unique items in column 0
27- >> python 12_csv_split.py -i input.csv -o userid -c 0
28- # access help
29- >> python 12_csv_split.py -h for help
30-
24+
25+ ```
26+ # split csv by every 100 rows
27+ >> python csv_split.py -i input.csv -o output -r 100
28+ ```
29+
3130"""
3231
33- def main (argv ):
34-
35- argument_dict = grab_command_line_arguments (argv )
36- parse_file (argument_dict )
37-
38-
39- def grab_command_line_arguments (argv ):
40-
41- # global variables
42- inputfile = ''
43- outputfile = ''
44- rowlimit = ''
45- columnindex = ''
46- argument_dict = {}
47-
48- # grab arguments
49- opts , args = getopt .getopt (argv ,"hi:o:r:c:" ,["ifile=" ,"ofile=" ,"rowlimit=" ,"columnindex=" ])
50-
51- # end if no arguments provided
52- if not opts :
53- print "No options provided. Try again. Use `-h` for help."
54- sys .exit ()
55-
56- # grab arguments
57- for opt , arg in opts :
58- if opt == '-h' :
59- print 'csvsplit.py -i <inputfile> -r <row limit> -c <column index> -o <outputfile>'
60- sys .exit ()
61- elif opt in ("-i" , "--ifile" ):
62- inputfile = arg
63- elif opt in ("-o" , "--ofile" ):
64- outputfile = arg
65- elif opt in ("-r" , "--rowlimit" ):
66- rowlimit = arg
67- elif opt in ("-c" , "--columnindex" ):
68- columnindex = arg
69-
70- # Output arguments
71<
4BEE
span class="diff-text-marker">- print "\n Arguments:"
72- if inputfile :
73- argument_dict ["input_file" ] = inputfile
74- print "Input file is '{}'" .format (inputfile )
75- else :
76- "Please enter an input file."
77- if outputfile :
78- argument_dict ["output_file" ] = outputfile
79- print "Output file is '{}'" .format (outputfile )
80- else :
81- print "Please enter an output file."
82- if rowlimit :
83- argument_dict ["rowlimit" ] = rowlimit
84- print "Rowlimit is '{}'" .format (rowlimit )
85- if columnindex :
86- argument_dict ["columnindex" ] = columnindex
87- print "Columnindex is '{}'" .format (columnindex )
88- if rowlimit and columnindex :
89- print "Please use either a rowlimit or columnlimit, not both."
90- sys .exit ()
91- if not rowlimit or columnindex :
92- print "Please enter either a rowlimit or columnlimit."
93- sys .exit ()
94-
95- # to do - check to make sure file, rowlimit, and columnlimit exist
96- print argument_dict
97- return argument_dict
98-
99-
100- def parse_file (argument_dict ):
101-
102- #split csv file by certain rownumber
103- if argument_dict ["rowlimit" ]:
104- rowlimit = int (argument_dict ["rowlimit" ])
105- output_name_file = "{}.csv" .format (argument_dict ["output_file" ])
106- output_path = '.'
107- keep_headers = True
108- delimiter = ','
109- filehandler = open (argument_dict ["input_file" ],'r' )
110- reader = csv .reader (filehandler , delimiter = delimiter )
111- current_piece = 1
112- current_out_path = os .path .join (
113- output_path ,
114- output_name_file
32+
33+ def get_arguments ():
34+ """Grab user supplied arguments using the argparse library."""
35+
36+ # Use arparse to get command line arguments
37+ parser = argparse .ArgumentParser ()
38+ parser .add_argument ("-i" , "--input_file" , required = True ,
39+ help = "csv input file (with extension)" , type = str )
40+ parser .add_argument ("-o" , "--output_file" , required = True ,
41+ help = "csv output file (without extension)" , type = str )
42+ parser .add_argument ("-r" , "--row_limit" , required = True ,
43+ help = "row limit to split csv at" , type = int )
44+ args = parser .parse_args ()
45+
46+ # Check if the input_file exits
47+ is_valid_file (parser , args .input_file )
48+
49+ # Check if the input_file is valid
50+ is_valid_csv (parser , args .input_file , args .row_limit )
51+
52+ return args .input_file , args .output_file , args .row_limit
53+
54+
55+ def is_valid_file (parser , file_name ):
56+ """Ensure that the input_file exists."""
57+ if not os .path .exists (file_name ):
58+ parser .error ("The file '{}' does not exist!" .format (file_name ))
59+ sys .exit (1 )
60+
61+
62+ def is_valid_csv (parser , file_name , row_limit ):
63+ """
64+ Ensure that the # of rows in the input_file
65+ is greater than the row_limit.
66+ """
67+ row_count = 0
68+ for row in csv .reader (open (file_name )):
69+ row_count += 1
70+ # Note: You could also use a generator expression
71+ # and the sum() function to count the rows:
72+ # row_count = sum(1 for row in csv.reader(open(file_name)))
73+ if row_limit > row_count :
74+ parser .error (
75+ "The 'row_count' of '{}' is > the number of rows in '{}'!"
76+ .format (row_limit , file_name )
11577 )
116- current_out_writer = csv .writer (open (current_out_path , 'w' ), delimiter = delimiter )
117- current_limit = rowlimit
118- if keep_headers :
119- headers = reader .next ()
120- current_out_writer .writerow (headers )
121- for i , row in enumerate (reader ):
122- if i + 1 > current_limit :
123- current_piece += 1
124- current_limit = rowlimit * current_piece
125- current_out_path = os .path .join (
126- output_path ,
127- output_name_file
128- )
129- current_out_writer = csv .writer (open (current_out_path , 'w' ), delimiter = delimiter )
130-
131- # elif columnindex: #split csv file accrording to unique values of certain column,it's like filter only certain item in excel
132- # itemlist = []
133- # columnindex = int(columnindex)
134- # output_name_template= outputfile+'_%s.csv'
135- # output_path='.'
136- # keep_headers=True
137- # delimiter=','
138- # filehandler = open(inputfile,'r')
139- # reader = csv.reader(filehandler, delimiter=delimiter)
140- # if keep_headers:
141- # headers = reader.next()
142-
143- # for i, row in enumerate(reader):
144-
145- # current_out_path = os.path.join(
146- # output_path,
147- # output_name_template % row[columnindex] )
148- # if row[columnindex] not in itemlist:
149- # try:
150- # current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
151- # except IOError:
152- # continue
153- # else:
154- # itemlist.append(row[columnindex])
155- # if keep_headers:
156- # current_out_writer.writerow(headers)
157- # current_out_writer.writerow(row)
158- # else:
159- # current_out_writer = csv.writer(open(current_out_path, 'a'), delimiter=delimiter)
160- # current_out_writer.writerow(row)
161- # print 'totally %i unique items in column %i \n' % (len(itemlist),columnindex)
162- # else:
163- # print "oops, please check instruction of script by >>./csvsplit.py -h"
78+ sys .exit (1 )
79+
80+
81+ def parse_file (arguments ):
82+ """
83+ Splits the CSV into multiple files or chunks based on the row_limit.
84+ Then create new CSV files.
85+ """
86+ input_file = arguments [0 ]
87+ output_file = arguments [1 ]
88+ row_limit = arguments [2 ]
89+ output_path = '.' # Current directory
90+
91+ # Read CSV, split into list of lists
92+ with open (input_file , 'r' ) as input_csv :
93+ datareader = csv .reader (input_csv )
94+ all_rows = []
95+ for row in datareader :
96+ all_rows .append (row )
97+
98+ # Remove header
99+ header = all_rows .pop (0 )
100+
101+ # Split list of list into chunks
102+ current_chunk = 0
103+ for i in range (0 , len (all_rows ), row_limit ): # Loop through list
104+ chunk = all_rows [i :i + row_limit ] # Create single chunk
105+
106+ current_output = os .path .join ( # Create new output file
107+ output_path ,
108+ "{}-{}.csv" .format (output_file , current_chunk )
109+ )
110+
111+ # Add header
112+ chunk .insert (0 , header )
113+
114+ # Write chunk to output file
115+ with open (current_output , 'w' ) as output_csv :
116+ writer = csv .writer (output_csv )
117+ writer = writer .writerows (chunk )
118+
119+ # Output info
120+ print ""
121+ print "Chunk # {}:" .format (current_chunk )
122+ print "Filepath: {}" .format (current_output )
123+ print "# of rows: {}" .format (len (chunk ))
124+
125+ # Create new chunk
126+ current_chunk += 1
164127
165128
166129if __name__ == "__main__" :
167- main (sys .argv [1 :])
130+ arguments = get_arguments ()
131+ parse_file (arguments )
0 commit comments