FIX PermissionError in datasets fetchers on Windows (#9847) · scikit-learn/scikit-learn@534f68b · GitHub

Commit 534f68b

massich authored and jnothman committed

FIX PermissionError in datasets fetchers on Windows (#9847)

1 parent 2109c37 · commit 534f68b

File tree

3 files changed: +47 −44 lines changed

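Background on the failure mode: the fetchers below deleted a downloaded archive while a handle into it was still open. POSIX systems allow unlinking an open file, but Windows locks it, so `remove()` raised `PermissionError`. A minimal standalone sketch of that behavior, not taken from the patch; the path and payload are throwaway examples:

import os
import tempfile

# On POSIX, unlinking an open file succeeds; on Windows it raises
# PermissionError because open handles lock the file.
path = os.path.join(tempfile.mkdtemp(), 'archive.bin')
with open(path, 'wb') as fh:
    fh.write(b'payload')

handle = open(path, 'rb')  # simulate a fetcher leaving the archive open
try:
    os.remove(path)        # raises PermissionError on Windows only
except PermissionError as exc:
    print('remove failed while the file was open:', exc)
finally:
    handle.close()         # once closed, the file can always be removed
    if os.path.exists(path):
        os.remove(path)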

sklearn/datasets/california_housing.py

Lines changed: 12 additions & 10 deletions

@@ -49,6 +49,7 @@
 
 logger = logging.getLogger(__name__)
 
+
 def fetch_california_housing(data_home=None, download_if_missing=True):
     """Loader for the California housing dataset from StatLib.
 
@@ -96,20 +97,21 @@ def fetch_california_housing(data_home=None, download_if_missing=True):
 
         logger.info('Downloading Cal. housing from {} to {}'.format(
             ARCHIVE.url, data_home))
+
         archive_path = _fetch_remote(ARCHIVE, dirname=data_home)
 
-        fileobj = tarfile.open(
-            mode="r:gz",
-            name=archive_path).extractfile(
-                'CaliforniaHousing/cal_housing.data')
+        with tarfile.open(mode="r:gz", name=archive_path) as f:
+            cal_housing = np.loadtxt(
+                f.extractfile('CaliforniaHousing/cal_housing.data'),
+                delimiter=',')
+            # Columns are not in the same order compared to the previous
+            # URL resource on lib.stat.cmu.edu
+            columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0]
+            cal_housing = cal_housing[:, columns_index]
+
+            joblib.dump(cal_housing, filepath, compress=6)
         remove(archive_path)
 
-        cal_housing = np.loadtxt(fileobj, delimiter=',')
-        # Columns are not in the same order compared to the previous
-        # URL resource on lib.stat.cmu.edu
-        columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0]
-        cal_housing = cal_housing[:, columns_index]
-        joblib.dump(cal_housing, filepath, compress=6)
     else:
         cal_housing = joblib.load(filepath)
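The fix pattern in isolation: read everything out of the tarball inside a `with` block so the archive handle is closed before `remove()` runs. A self-contained sketch of that pattern, not the fetcher itself; the archive is built on the fly and the file names are illustrative:

import io
import os
import tarfile
import numpy as np

# Build a tiny gzipped tarball containing one CSV member, purely so the
# read side below has something to open.
with tarfile.open('archive.tar.gz', mode='w:gz') as tar:
    payload = b'1.0,2.0\n3.0,4.0\n'
    info = tarfile.TarInfo(name='data.csv')
    info.size = len(payload)
    tar.addfile(info, io.BytesIO(payload))

# Read the member inside the with block; the tarfile handle is closed at
# the end of the block, before the archive file is deleted.
with tarfile.open(name='archive.tar.gz', mode='r:gz') as tar:
    data = np.loadtxt(tar.extractfile('data.csv'), delimiter=',')

os.remove('archive.tar.gz')  # safe: no open handle into the archive remains
print(data)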

sklearn/datasets/rcv1.py

Lines changed: 21 additions & 18 deletions

@@ -166,21 +166,23 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True,
 
         Xy = load_svmlight_files(files, n_features=N_FEATURES)
 
-        # delete archives
-        for f in files:
-            remove(f.name)
-
         # Training data is before testing data
         X = sp.vstack([Xy[8], Xy[0], Xy[2], Xy[4], Xy[6]]).tocsr()
         sample_id = np.hstack((Xy[9], Xy[1], Xy[3], Xy[5], Xy[7]))
         sample_id = sample_id.astype(np.uint32)
 
         joblib.dump(X, samples_path, compress=9)
         joblib.dump(sample_id, sample_id_path, compress=9)
+
+        # delete archives
+        for f in files:
+            f.close()
+            remove(f.name)
     else:
         X = joblib.load(samples_path)
         sample_id = joblib.load(sample_id_path)
 
+
     # load target (y), categories, and sample_id_bis
     if download_if_missing and (not exists(sample_topics_path) or
                                 not exists(topics_path)):
@@ -195,20 +197,21 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True,
         y = np.zeros((N_SAMPLES, N_CATEGORIES), dtype=np.uint8)
         sample_id_bis = np.zeros(N_SAMPLES, dtype=np.int32)
         category_names = {}
-        for line in GzipFile(filename=topics_archive_path, mode='rb'):
-            line_components = line.decode("ascii").split(u" ")
-            if len(line_components) == 3:
-                cat, doc, _ = line_components
-                if cat not in category_names:
-                    n_cat += 1
-                    category_names[cat] = n_cat
-
-                doc = int(doc)
-                if doc != doc_previous:
-                    doc_previous = doc
-                    n_doc += 1
-                    sample_id_bis[n_doc] = doc
-                y[n_doc, category_names[cat]] = 1
+        with GzipFile(filename=topics_archive_path, mode='rb') as f:
+            for line in f:
+                line_components = line.decode("ascii").split(u" ")
+                if len(line_components) == 3:
+                    cat, doc, _ = line_components
+                    if cat not in category_names:
+                        n_cat += 1
+                        category_names[cat] = n_cat
+
+                    doc = int(doc)
+                    if doc != doc_previous:
+                        doc_previous = doc
+                        n_doc += 1
+                        sample_id_bis[n_doc] = doc
+                    y[n_doc, category_names[cat]] = 1
 
         # delete archive
         remove(topics_archive_path)
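Both hunks apply the same discipline: explicitly `close()` the downloaded files before `remove()`, and keep the `GzipFile` inside a `with` block while iterating it. A compact runnable sketch of the GzipFile half; the archive name and its contents are invented for illustration:

import gzip
import os

# Write a small gzipped topics file so the read side has input.
with gzip.open('topics.gz', 'wt') as f:
    f.write('cat1 42 1\ncat2 42 1\ncat2 43 1\n')

# Iterate the archive inside a with block; the handle is closed before
# the file is deleted, which is what Windows requires.
category_names, n_cat = {}, -1
with gzip.open('topics.gz', 'rb') as f:
    for line in f:
        parts = line.decode('ascii').split(' ')
        if len(parts) == 3:
            cat, doc, _ = parts
            if cat not in category_names:
                n_cat += 1
                category_names[cat] = n_cat

os.remove('topics.gz')  # handle already closed, safe on Windows
print(category_names)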

sklearn/datasets/species_distributions.py

Lines changed: 14 additions & 16 deletions

@@ -240,29 +240,27 @@ def fetch_species_distributions(data_home=None,
         logger.info('Downloading species data from %s to %s' % (
             SAMPLES.url, data_home))
         samples_path = _fetch_remote(SAMPLES, dirname=data_home)
-        X = np.load(samples_path)  # samples.zip is a valid npz
+        with np.load(samples_path) as X:  # samples.zip is a valid npz
+            for f in X.files:
+                fhandle = BytesIO(X[f])
+                if 'train' in f:
+                    train = _load_csv(fhandle)
+                if 'test' in f:
+                    test = _load_csv(fhandle)
         remove(samples_path)
 
-        for f in X.files:
-            fhandle = BytesIO(X[f])
-            if 'train' in f:
-                train = _load_csv(fhandle)
-            if 'test' in f:
-                test = _load_csv(fhandle)
-
         logger.info('Downloading coverage data from %s to %s' % (
             COVERAGES.url, data_home))
         coverages_path = _fetch_remote(COVERAGES, dirname=data_home)
-        X = np.load(coverages_path)  # coverages.zip is a valid npz
+        with np.load(coverages_path) as X:  # coverages.zip is a valid npz
+            coverages = []
+            for f in X.files:
+                fhandle = BytesIO(X[f])
+                logger.debug(' - converting {}'.format(f))
+                coverages.append(_load_coverage(fhandle))
+            coverages = np.asarray(coverages, dtype=dtype)
         remove(coverages_path)
 
-        coverages = []
-        for f in X.files:
-            fhandle = BytesIO(X[f])
-            logger.debug(' - converting {}'.format(f))
-            coverages.append(_load_coverage(fhandle))
-        coverages = np.asarray(coverages, dtype=dtype)
-
     bunch = Bunch(coverages=coverages,
                   test=test,
                   train=train,
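`np.load` on an `.npz` archive returns an `NpzFile` that keeps the underlying zip open for lazy member access; the patch relies on it supporting the context-manager protocol so the handle is guaranteed closed before `remove()`. A runnable sketch of the pattern; the archive and member names are illustrative:

import os
import numpy as np

# np.savez writes a zip of .npy members; np.load returns an NpzFile
# that holds the zip open until it is closed.
np.savez('samples.npz', train=np.arange(3), test=np.arange(3, 6))

with np.load('samples.npz') as X:
    for name in X.files:   # member names here: 'train', 'test'
        print(name, X[name])

os.remove('samples.npz')   # safe: the with block closed the zip handle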

0 commit comments