8000 Merge pull request #11422 from tylerjereddy/loadtxt_csv_asv_bench · numpy/numpy@52c67eb · GitHub
[go: up one dir, main page]

Skip to content

Commit 52c67eb

Browse files
authored
Merge pull request #11422 from tylerjereddy/loadtxt_csv_asv_bench
BENCH: Add benchmarks for np.loadtxt reading from CSV format files
2 parents ee82dc7 + f25b18f commit 52c67eb

File tree

1 file changed

+177
-0
lines changed

1 file changed

+177
-0
lines changed

benchmarks/benchmarks/bench_io.py

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from .common import Benchmark, get_squares
44

55
import numpy as np
6+
from io import StringIO
67

78

89
class Copy(Benchmark):
@@ -62,3 +63,179 @@ def setup(self):
6263

6364
def time_vb_savez_squares(self):
    # Time np.savez storing self.squares in the archive
    # 'tmp.npz' (a relative path, so it lands in the current
    # working directory).
    archive = 'tmp.npz'
    np.savez(archive, self.squares)
66+
67+
class LoadtxtCSVComments(Benchmark):
    # Times np.loadtxt on CSV text in which every row ends with
    # a '#' comment.

    param_names = ['num_lines']
    params = [10, int(1e2), int(1e4), int(1e5)]

    def setup(self, num_lines):
        # timeit only runs setup() between repeats, not between
        # the iterations inside a repeat, so the one StringIO
        # buffer built here is shared by those iterations and
        # has to be rewound by the benchmark itself.
        row = u'1,2,3 # comment'
        self.data_comments = StringIO(u'\n'.join([row] * num_lines))

    def time_comment_loadtxt_csv(self, num_lines):
        # Comment handling while loading CSV data; inspired by a
        # similar read_csv benchmark in pandas.
        #
        # The trailing seek(0) rewinds the buffer for the next
        # iteration; its cost is unavoidably part of the timing.
        buf = self.data_comments
        np.loadtxt(buf, delimiter=u',')
        buf.seek(0)
95+
96+
class LoadtxtCSVdtypes(Benchmark):
    # Times np.loadtxt parsing / casting the same CSV text into
    # each of several dtypes.

    param_names = ['dtype', 'num_lines']
    params = (
        ['float32', 'float64', 'int32', 'int64',
         'complex128', 'str', 'object'],
        [10, int(1e2), int(1e4), int(1e5)],
    )

    def setup(self, dtype, num_lines):
        # One three-column row repeated num_lines times.
        row = u'5, 7, 888'
        self.csv_data = StringIO(u'\n'.join([row] * num_lines))

    def time_loadtxt_dtypes_csv(self, dtype, num_lines):
        # Load the CSV text as an array of the requested dtype.
        #
        # Reading consumes the StringIO buffer, so it is rewound
        # afterwards for the next timing iteration.
        buf = self.csv_data
        np.loadtxt(buf, delimiter=u',', dtype=dtype)
        buf.seek(0)
120+
121+
class LoadtxtCSVStructured(Benchmark):
    # Times np.loadtxt reading CSV text into a structured
    # (multi-field) dtype.

    def setup(self):
        # 50000 identical five-column rows.
        num_lines = 50000
        row = u"M, 21, 72, X, 155"
        self.csv_data = StringIO(u'\n'.join([row] * num_lines))

    def time_loadtxt_csv_struct_dtype(self):
        # One field per CSV column.
        record_dtype = [
            ('category_1', 'S1'),
            ('category_2', 'i4'),
            ('category_3', 'f8'),
            ('category_4', 'S1'),
            ('category_5', 'f8'),
        ]
        buf = self.csv_data
        np.loadtxt(buf, delimiter=u',', dtype=record_dtype)
        # The buffer must be rewound between the iterations of
        # a repeat.
        buf.seek(0)
142+
143+
144+
class LoadtxtCSVSkipRows(Benchmark):
    # Benchmarks np.loadtxt row skipping while reading CSV data
    # from a file on disk; a similar benchmark is present in the
    # pandas asv suite.

    params = [0, 500, 10000]
    param_names = ['skiprows']

    def setup(self, skiprows):
        # Seed the generator so every run writes the same
        # 100000 x 3 table.
        np.random.seed(123)
        test_array = np.random.rand(100000, 3)
        self.fname = 'test_array.csv'
        np.savetxt(fname=self.fname,
                   X=test_array,
                   delimiter=',')

    def teardown(self, skiprows):
        # Delete the file written by setup() so that benchmark
        # runs do not leave it behind in the working directory.
        import os

        if os.path.exists(self.fname):
            os.remove(self.fname)

    def time_skiprows_csv(self, skiprows):
        # Read the file back, skipping the first `skiprows` rows.
        np.loadtxt(self.fname,
                   delimiter=',',
                   skiprows=skiprows)
164+
165+
class LoadtxtReadUint64Integers(Benchmark):
    # Modelled on a similar CSV reading benchmark in pandas,
    # adapted to np.loadtxt.

    param_names = ['size']
    params = [550, 1000, 10000]

    def setup(self, size):
        # data1: `size` consecutive integers starting at 2**63,
        # one per line.
        base = np.arange(size).astype('uint64') + 2**63
        lines = base.astype(str).tolist()
        self.data1 = StringIO(u'\n'.join(lines))

        # data2: the same values, except that entry 500 is
        # replaced by -1.
        with_negative = base.astype(object)
        with_negative[500] = -1
        lines = with_negative.astype(str).tolist()
        self.data2 = StringIO(u'\n'.join(lines))

    def time_read_uint64(self, size):
        buf = self.data1
        np.loadtxt(buf)
        # The buffer must be rewound between the iterations of
        # a repeat.
        buf.seek(0)

    def time_read_uint64_neg_values(self, size):
        buf = self.data2
        np.loadtxt(buf)
        # The buffer must be rewound between the iterations of
        # a repeat.
        buf.seek(0)
190+
191+
class LoadtxtUseColsCSV(Benchmark):
    # Times np.loadtxt when only selected columns of the CSV
    # text are requested through `usecols`.

    param_names = ['usecols']
    params = [2, [1, 3], [1, 3, 5, 7]]

    def setup(self, usecols):
        # 5000 identical ten-column rows.
        num_lines = 5000
        row = u'0, 1, 2, 3, 4, 5, 6, 7, 8, 9'
        self.csv_data = StringIO(u'\n'.join([row] * num_lines))

    def time_loadtxt_usecols_csv(self, usecols):
        buf = self.csv_data
        np.loadtxt(buf, delimiter=u',', usecols=usecols)
        # Reading is state-dependent, so rewind the buffer for
        # the next iteration.
        buf.seek(0)
210+
211+
class LoadtxtCSVDateTime(Benchmark):
    # Benchmarks np.loadtxt operating on datetime data in a
    # CSV file.

    params = [20, 200, 2000, 20000]
    param_names = ['num_lines']

    def setup(self, num_lines):
        # Build the equivalent of a two-column CSV file with date
        # strings in the first column and random floating point
        # data in the second column.
        dates = np.arange('today', 20, dtype=np.datetime64)
        np.random.seed(123)
        values = np.random.rand(20)

        # Assemble the block of "date,value" rows in one join
        # rather than by repeated string concatenation.
        date_line = u''.join(
            str(date) + ',' + str(value) + '\n'
            for date, value in zip(dates, values))

        # Expand the data to the specified number of lines by
        # repeating the block num_lines // 20 times.
        data = date_line * (num_lines // 20)
        self.csv_data = StringIO(data)

    def time_loadtxt_csv_datetime(self, num_lines):
        # The parsed array is not needed, so the result is
        # discarded instead of being bound to a local name.
        np.loadtxt(self.csv_data,
                   delimiter=u',',
                   dtype=([('dates', 'M8[us]'),
                           ('values', 'float64')]))
        # Rewind the StringIO object -- the timing iterations
        # are state-dependent.
        self.csv_data.seek(0)

0 commit comments

Comments
 (0)
0