1
+ ### WIP
2
+
3
+ import sys
4
+ import os
5
+ import getopt
6
+ import csv
7
+
8
+ """
9
+ Splits a CSV file into multiple pieces based on command line arguments.
10
+
11
+ Arguments:
12
+ `-h`: help file of usage of the script
13
+ `-i`: input file name
14
+ `-o`: base name of the output file(s), given without the `.csv` extension
15
+ `-r`: row limit to split
16
+ `-c`: index of the column to split on, one piece per unique value (use instead of `-r`; work in progress)
17
+
18
+ Default settings:
19
+ `output_path` is the current directory
20
+ `keep_headers` is on (headers will be kept)
21
+ `delimiter` is `,`
22
+
23
+ Example usage:
24
+ # split by every 10000 rows
25
+ >> python 12_csv_split.py -i input.csv -o rownumber -r 10000
26
+ # split by unique items in column 0
27
+ >> python 12_csv_split.py -i input.csv -o userid -c 0
28
+ # access help
29
+ >> python 12_csv_split.py -h for help
30
+
31
+ """
32
+
33
def main(argv):
    """Run the CSV splitter for the given command-line arguments.

    ``argv`` is handed to ``grab_command_line_arguments``; the dictionary it
    returns is then passed straight to ``parse_file``.
    """
    parse_file(grab_command_line_arguments(argv))
37
+
38
+
39
def grab_command_line_arguments(argv):
    """Parse the command-line options into a dictionary.

    Args:
        argv: Argument list without the script name (e.g. ``sys.argv[1:]``).

    Returns:
        A dict holding ``"input_file"`` and ``"output_file"`` plus exactly one
        of ``"rowlimit"`` or ``"columnindex"``. Values are the raw option
        strings; no numeric conversion is done here.

    Raises:
        SystemExit: with status 0 after printing usage for ``-h``; with
            status 2 when no options are given, an option is unknown, a
            required file name is missing, or the split mode is not exactly
            one of row limit / column index.
    """
    usage = 'csvsplit.py -i <inputfile> -r <row limit> -c <column index> -o <outputfile>'

    inputfile = ''
    outputfile = ''
    rowlimit = ''
    columnindex = ''
    argument_dict = {}

    try:
        opts, _ = getopt.getopt(
            argv,
            "hi:o:r:c:",
            ["ifile=", "ofile=", "rowlimit=", "columnindex="],
        )
    except getopt.GetoptError as err:
        # Unknown option or missing option value: report it instead of
        # letting the traceback reach the user.
        print(str(err))
        print(usage)
        sys.exit(2)

    # end if no arguments provided
    if not opts:
        print("No options provided. Try again. Use `-h` for help.")
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
        elif opt in ("-r", "--rowlimit"):
            rowlimit = arg
        elif opt in ("-c", "--columnindex"):
            columnindex = arg

    # Echo the arguments back so the user can see what was understood.
    print("\n Arguments:")
    if inputfile:
        argument_dict["input_file"] = inputfile
        print("Input file is '{}'".format(inputfile))
    else:
        print("Please enter an input file.")
    if outputfile:
        argument_dict["output_file"] = outputfile
        print("Output file is '{}'".format(outputfile))
    else:
        print("Please enter an output file.")
    if rowlimit:
        argument_dict["rowlimit"] = rowlimit
        print("Rowlimit is '{}'".format(rowlimit))
    if columnindex:
        argument_dict["columnindex"] = columnindex
        print("Columnindex is '{}'".format(columnindex))

    if rowlimit and columnindex:
        print("Please use either a rowlimit or columnlimit, not both.")
        sys.exit(2)
    # Parenthesised on purpose: the check is "neither was given". Without the
    # parentheses `not` binds only to `rowlimit` and a column-only call fails.
    if not (rowlimit or columnindex):
        print("Please enter either a rowlimit or columnlimit.")
        sys.exit(2)
    # Both file names are required; the messages were already printed above.
    if not (inputfile and outputfile):
        sys.exit(2)

    # TODO: check that the input file exists and that the row limit /
    # column index are valid integers.
    print(argument_dict)
    return argument_dict
98
+
99
+
100
def _open_csv_file(path, mode):
    """Open ``path`` the way the ``csv`` module expects on this interpreter."""
    if sys.version_info[0] >= 3:
        # Python 3: text mode with newline translation disabled.
        return open(path, mode, newline='')
    # Python 2: the csv module works on binary-mode files.
    return open(path, mode + 'b')


def parse_file(argument_dict, output_path='.', keep_headers=True, delimiter=','):
    """Split the input CSV into numbered pieces of at most ``rowlimit`` rows.

    Pieces are written to ``<output_path>/<output_file>_<piece>.csv`` with
    piece numbers starting at 1. A new piece is only opened when there is a
    row to put in it, so no empty trailing piece is produced.

    Args:
        argument_dict: Dict with ``"input_file"`` (path of the CSV to read),
            ``"output_file"`` (base name of the pieces, without extension)
            and ``"rowlimit"`` (maximum data rows per piece; anything
            ``int()`` accepts).
        output_path: Directory the pieces are written to.
        keep_headers: When true, the first input row is treated as a header
            and repeated at the top of every piece.
        delimiter: Field delimiter used for both reading and writing.

    Raises:
        KeyError: if ``"input_file"`` or ``"output_file"`` is missing.
        ValueError: if the row limit is not a positive integer.
    """
    raw_limit = argument_dict.get("rowlimit")
    if not raw_limit:
        # Only the row-limit mode is handled by this function.
        print("No rowlimit given; nothing to split.")
        return

    rowlimit = int(raw_limit)
    if rowlimit < 1:
        raise ValueError("rowlimit must be a positive integer")

    output_base = argument_dict["output_file"]
    in_handle = _open_csv_file(argument_dict["input_file"], 'r')
    out_handle = None
    try:
        reader = csv.reader(in_handle, delimiter=delimiter)
        # next(reader, None) keeps an empty input file from raising.
        headers = next(reader, None) if keep_headers else None

        current_piece = 1
        out_handle = _open_csv_file(
            os.path.join(output_path, "{}_{}.csv".format(output_base, current_piece)),
            'w',
        )
        current_out_writer = csv.writer(out_handle, delimiter=delimiter)
        if headers is not None:
            current_out_writer.writerow(headers)

        for i, row in enumerate(reader):
            if i >= rowlimit * current_piece:
                # Current piece is full: close it and start the next one.
                out_handle.close()
                current_piece += 1
                out_handle = _open_csv_file(
                    os.path.join(output_path, "{}_{}.csv".format(output_base, current_piece)),
                    'w',
                )
                current_out_writer = csv.writer(out_handle, delimiter=delimiter)
                if headers is not None:
                    current_out_writer.writerow(headers)
            current_out_writer.writerow(row)
    finally:
        if out_handle is not None:
            out_handle.close()
        in_handle.close()
130
+
131
+ # elif columnindex: #split csv file accrording to unique values of certain column,it's like filter only certain item in excel
132
+ # itemlist = []
133
+ # columnindex = int(columnindex)
134
+ # output_name_template= outputfile+'_%s.csv'
135
+ # output_path='.'
136
+ # keep_headers=True
137
+ # delimiter=','
138
+ # filehandler = open(inputfile,'r')
139
+ # reader = csv.reader(filehandler, delimiter=delimiter)
140
+ # if keep_headers:
141
+ # headers = reader.next()
142
+
143
+ # for i, row in enumerate(reader):
144
+
145
+ # current_out_path = os.path.join(
146
+ # output_path,
147
+ # output_name_template % row[columnindex] )
148
+ # if row[columnindex] not in itemlist:
149
+ # try:
150
+ # current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
151
+ # except IOError:
152
+ # continue
153
+ # else:
154
+ # itemlist.append(row[columnindex])
155
+ # if keep_headers:
156
+ # current_out_writer.writerow(headers)
157
+ # current_out_writer.writerow(row)
158
+ # else:
159
+ # current_out_writer = csv.writer(open(current_out_path, 'a'), delimiter=delimiter)
160
+ # current_out_writer.writerow(row)
161
+ # print 'totally %i unique items in column %i \n' % (len(itemlist),columnindex)
162
+ # else:
163
+ # print "oops, please check instruction of script by >>./csvsplit.py -h"
164
+
165
+
166
if __name__ == "__main__":
    # Script entry point: pass everything after the script name to main().
    cli_args = sys.argv[1:]
    main(cli_args)
0 commit comments