8000 added csv_split · ThinkCode/python-scripts@9828e28 · GitHub
[go: up one dir, main page]

Skip to content

Commit 9828e28

Browse files
committed
added csv_split
1 parent 8b65a7c commit 9828e28

File tree

1 file changed

+114
-150
lines changed

1 file changed

+114
-150
lines changed

12_csv_split.py

+114-150
8000
Original file line numberDiff line numberDiff line change
@@ -1,167 +1,131 @@
1-
### WIP
2-
31
import sys
42
import os
5-
import getopt
63
import csv
4+
import argparse
75

86
"""
9-
Splits a CSV file into multiple pieces based on command line arguments.
7+
8+
Splits a CSV file into multiple pieces based on command line arguments.
109
1110
Arguments:
12-
`-h`: help file of usage of the script
13-
`-i`: input file name
14-
`-o`: output file, A %s-style template for the numbered output files.
15-
`-r`: row limit to split
16-
`-c`: A %s-style template for the numbered output files.
11+
12+
`-h`: help file of usage of the script
13+
`-i`: input file name
14+
`-o`: output file name
15+
`-r`: row limit to split
1716
1817
Default settings:
19-
`output_path` is the current directory
20-
`keep_headers` is on (headers will be kept)
21-
`delimeter` is ,
18+
19+
`output_path` is the current directory
20+
headers are displayed on each split file
21+
the default delimeter is a comma
2222
2323
Example usage:
24-
# split by every 10000 rows
25-
>> python 12_csv_split.py -i input.csv -o rownumber -r 10000
26-
# split by unique items in column 0
27-
>> python 12_csv_split.py -i input.csv -o userid -c 0
28-
# access help
29-
>> python 12_csv_split.py -h for help
30-
24+
25+
```
26+
# split csv by every 100 rows
27+
>> python csv_split.py -i input.csv -o output -r 100
28+
```
29+
3130
"""
3231

33-
def main(argv):
34-
35-
argument_dict = grab_command_line_arguments(argv)
36-
parse_file(argument_dict)
37-
38-
39-
def grab_command_line_arguments(argv):
40-
41-
# global variables
42-
inputfile = ''
43-
outputfile = ''
44-
rowlimit = ''
45-
columnindex = ''
46-
argument_dict = {}
47-
48-
# grab arguments
49-
opts, args = getopt.getopt(argv,"hi:o:r:c:",["ifile=","ofile=","rowlimit=","columnindex="])
50-
51-
# end if no arguments provided
52-
if not opts:
53-
print "No options provided. Try again. Use `-h` for help."
54-
sys.exit()
55-
56-
# grab arguments
57-
for opt, arg in opts:
58-
if opt == '-h':
59-
print 'csvsplit.py -i <inputfile> -r <row limit> -c <column index> -o <outputfile>'
60-
sys.exit()
61-
elif opt in ("-i", "--ifile"):
62-
inputfile = arg
63-
elif opt in ("-o", "--ofile"):
64-
outputfile = arg
65-
elif opt in ("-r", "--rowlimit"):
66-
rowlimit = arg
67-
elif opt in ("-c", "--columnindex"):
68-
columnindex = arg
69-
70-
# Output arguments
71-
print "\nArguments:"
72-
if inputfile:
73-
argument_dict["input_file"] = inputfile
74-
print "Input file is '{}'".format(inputfile)
75-
else:
76-
"Please enter an input file."
77-
if outputfile:
78-
argument_dict["output_file"] = outputfile
79-
print "Output file is '{}'".format(outputfile)
80-
else:
81-
print "Please enter an output file."
82-
if rowlimit:
83-
argument_dict["rowlimit"] = rowlimit
84-
print "Rowlimit is '{}'".format(rowlimit)
85-
if columnindex:
86-
argument_dict["columnindex"] = columnindex
87-
print "Columnindex is '{}'".format(columnindex)
88-
if rowlimit and columnindex:
89-
print "Please use either a rowlimit or columnlimit, not both."
90-
sys.exit()
91-
if not rowlimit or columnindex:
92-
print "Please enter either a rowlimit or columnlimit."
93-
sys.exit()
94-
95-
# to do - check to make sure file, rowlimit, and columnlimit exist
96-
print argument_dict
97-
return argument_dict
98-
99-
100-
def parse_file(argument_dict):
101-
102-
#split csv file by certain rownumber
103-
if argument_dict["rowlimit"]:
104-
rowlimit = int(argument_dict["rowlimit"])
105-
output_name_file = "{}.csv".format(argument_dict["output_file"])
106-
output_path='.'
107-
keep_headers=True
108-
delimiter=','
109-
filehandler = open(argument_dict["input_file"],'r')
110-
reader = csv.reader(filehandler, delimiter=delimiter)
111-
current_piece = 1
112-
current_out_path = os.path.join(
113-
output_path,
114-
output_name_file
32+
33+
def get_arguments():
34+
"""Grab user supplied arguments using the argparse library."""
35+
36+
# Use arparse to get command line arguments
37+
parser = argparse.ArgumentParser()
38+
parser.add_argument("-i", "--input_file", required=True,
39+
help="csv input file (with extension)", type=str)
40+
parser.add_argument("-o", "--output_file", required=True,
41+
help="csv output file (without extension)", type=str)
42+
parser.add_argument("-r", "--row_limit", required=True,
43+
help="row limit to split csv at", type=int)
44+
args = parser.parse_args()
45+
46+
# Check if the input_file exits
47+
is_valid_file(parser, args.input_file)
48+
49+
# Check if the input_file is valid
50+
is_valid_csv(parser, args.input_file, args.row_limit)
51+
52+
return args.input_file, args.output_file, args.row_limit
53+
54+
55+
def is_valid_file(parser, file_name):
56+
"""Ensure that the input_file exists."""
57+
if not os.path.exists(file_name):
58+
parser.error("The file '{}' does not exist!".format(file_name))
59+
sys.exit(1)
60+
61+
62+
def is_valid_csv(parser, file_name, row_limit):
63+
"""
64+
Ensure that the # of rows in the input_file
65+
is greater than the row_limit.
66+
"""
67+
row_count = 0
68+
for row in csv.reader(open(file_name)):
69+
row_count += 1
70+
# Note: You could also use a generator expression
71+
# and the sum() function to count the rows:
72+
# row_count = sum(1 for row in csv.reader(open(file_name)))
73+
if row_limit > row_count:
74+
parser.error(
75+
"The 'row_count' of '{}' is > the number of rows in '{}'!"
76+
.format(row_limit, file_name)
11577
)
116-
current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
117-
current_limit = rowlimit
118-
if keep_headers:
119-
headers = reader.next()
120-
current_out_writer.writerow(headers)
121-
for i, row in enumerate(reader):
122-
if i + 1 > current_limit:
123-
current_piece += 1
124-
current_limit = rowlimit * current_piece
125-
current_out_path = os.path.join(
126-
output_path,
127-
output_name_file
128-
)
129-
current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
130-
131-
# elif columnindex: #split csv file accrording to unique values of certain column,it's like filter only certain item in excel
132-
# itemlist = []
133-
# columnindex = int(columnindex)
134-
# output_name_template= outputfile+'_%s.csv'
135-
# output_path='.'
136-
# keep_headers=True
137-
# delimiter=','
138-
# filehandler = open(inputfile,'r')
139-
# reader = csv.reader(filehandler, delimiter=delimiter)
140-
# if keep_headers:
141-
# headers = reader.next()
142-
143-
# for i, row in enumerate(reader):
144-
145-
# current_out_path = os.path.join(
146-
# output_path,
147-
# output_name_template % row[columnindex] )
148-
# if row[columnindex] not in itemlist:
149-
# try:
150-
# current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
151-
# except IOError:
152-
# continue
153-
# else:
154-
# itemlist.append(row[columnindex])
155-
# if keep_headers:
156-
# current_out_writer.writerow(headers)
157-
# current_out_writer.writerow(row)
158-
# else:
159-
# current_out_writer = csv.writer(open(current_out_path, 'a'), delimiter=delimiter)
160-
# current_out_writer.writerow(row)
161-
# print 'totally %i unique items in column %i \n' % (len(itemlist),columnindex)
162-
# else:
163-
# print "oops, please check instruction of script by >>./csvsplit.py -h"
78+
sys.exit(1)
79+
80+
81+
def parse_file(arguments):
82+
"""
83+
Splits the CSV into multiple files or chunks based on the row_limit.
84+
Then create new CSV files.
85+
"""
86+
input_file = arguments[0]
87+
output_file = arguments[1]
88+
row_limit = arguments[2]
89+
output_path = '.' # Current directory
90+
91+
# Read CSV, split into list of lists
92+
with open(input_file, 'r') as input_csv:
93+
datareader = csv.reader(input_csv)
94+
all_rows = []
95+
for row in datareader:
96+
all_rows.append(row)
97+
98+
# Remove header
99+
header = all_rows.pop(0)
100+
101+
# Split list of list into chunks
102+
current_chunk = 0
103+
for i in range(0, len(all_rows), row_limit): # Loop through list
104+
chunk = all_rows[i:i + row_limit] # Create single chunk
105+
106+
current_output = os.path.join( # Create new output file
107+
output_path,
108+
"{}-{}.csv".format(output_file, current_chunk)
109+
)
110+
111+
# Add header
112+
chunk.insert(0, header)
113+
114+
# Write chunk to output file
115+
with open(current_output, 'w') as output_csv:
116+
writer = csv.writer(output_csv)
117+
writer = writer.writerows(chunk)
118+
119+
# Output info
120+
print ""
121+
print "Chunk # {}:".format(current_chunk)
122+
print "Filepath: {}".format(current_output)
123+
print "# of rows: {}".format(len(chunk))
124+
125+
# Create new chunk
126+
current_chunk += 1
164127

165128

166129
if __name__ == "__main__":
167-
main(sys.argv[1:])
130+
arguments = get_arguments()
131+
parse_file(arguments)

0 commit comments

Comments
 (0)
0