FIX PermissionError in datasets fetchers on Windows · scikit-learn/scikit-learn@6e3d73b · GitHub

Commit 6e3d73b

Joan Massich authored and lesteve committed
FIX PermissionError in datasets fetchers on Windows
PermissionError: [WinError 32] The process cannot access the file because it is being used by another process. This was happening when trying to remove the downloaded archive because the archive was not properly closed.
1 parent 32ac228 commit 6e3d73b
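
For context, a minimal sketch (not part of this commit, using a throwaway temporary file) of the failure mode the commit message describes: on Windows, a file cannot be removed while any handle to it is still open, so the fetchers must close the downloaded archives before calling remove().

import os
import tempfile

# Create a throwaway file standing in for a downloaded archive.
fd, path = tempfile.mkstemp()
os.close(fd)

f = open(path, "rb")
# os.remove(path)   # on Windows: PermissionError: [WinError 32] The process
#                   # cannot access the file because it is being used by
#                   # another process.
f.close()           # close the handle first (the fetchers now do this with
os.remove(path)     # ``with`` blocks or explicit close()), then removal works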

3 files changed: +47 -44 lines changed


sklearn/datasets/california_housing.py

Lines changed: 12 additions & 10 deletions
@@ -49,6 +49,7 @@
 
 logger = logging.getLogger(__name__)
 
+
 def fetch_california_housing(data_home=None, download_if_missing=True):
     """Loader for the California housing dataset from StatLib.
 
@@ -96,20 +97,21 @@ def fetch_california_housing(data_home=None, download_if_missing=True):
 
         logger.info('Downloading Cal. housing from {} to {}'.format(
             ARCHIVE.url, data_home))
+
         archive_path = _fetch_remote(ARCHIVE, dirname=data_home)
 
-        fileobj = tarfile.open(
-            mode="r:gz",
-            name=archive_path).extractfile(
-                'CaliforniaHousing/cal_housing.data')
+        with tarfile.open(mode="r:gz", name=archive_path) as f:
+            cal_housing = np.loadtxt(
+                f.extractfile('CaliforniaHousing/cal_housing.data'),
+                delimiter=',')
+            # Columns are not in the same order compared to the previous
+            # URL resource on lib.stat.cmu.edu
+            columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0]
+            cal_housing = cal_housing[:, columns_index]
+
+        joblib.dump(cal_housing, filepath, compress=6)
         remove(archive_path)
 
-        cal_housing = np.loadtxt(fileobj, delimiter=',')
-        # Columns are not in the same order compared to the previous
-        # URL resource on lib.stat.cmu.edu
-        columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0]
-        cal_housing = cal_housing[:, columns_index]
-        joblib.dump(cal_housing, filepath, compress=6)
     else:
         cal_housing = joblib.load(filepath)
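
A standalone sketch (not from the commit; the archive is built on the fly and the member name only mirrors the real one) of the pattern the new code relies on: TarFile.extractfile() returns a stream that reads lazily from the open archive, so np.loadtxt() must run inside the with block, and the archive file can only be removed once the block has closed it.

import io
import os
import tarfile
import tempfile

import numpy as np

# Build a tiny stand-in for the downloaded cal_housing archive.
tmpdir = tempfile.mkdtemp()
archive_path = os.path.join(tmpdir, "cal_housing.tgz")
payload = b"1.0,2.0\n3.0,4.0\n"
with tarfile.open(archive_path, mode="w:gz") as tar:
    info = tarfile.TarInfo("CaliforniaHousing/cal_housing.data")
    info.size = len(payload)
    tar.addfile(info, io.BytesIO(payload))

with tarfile.open(mode="r:gz", name=archive_path) as f:
    # The member stream is only valid while the archive is open.
    cal_housing = np.loadtxt(
        f.extractfile('CaliforniaHousing/cal_housing.data'),
        delimiter=',')

os.remove(archive_path)   # every handle is closed, so this works on Windows
print(cal_housing.shape)  # (2, 2)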

sklearn/datasets/rcv1.py

Lines changed: 21 additions & 18 deletions
@@ -166,21 +166,23 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True,
 
         Xy = load_svmlight_files(files, n_features=N_FEATURES)
 
-        # delete archives
-        for f in files:
-            remove(f.name)
-
         # Training data is before testing data
         X = sp.vstack([Xy[8], Xy[0], Xy[2], Xy[4], Xy[6]]).tocsr()
         sample_id = np.hstack((Xy[9], Xy[1], Xy[3], Xy[5], Xy[7]))
         sample_id = sample_id.astype(np.uint32)
 
         joblib.dump(X, samples_path, compress=9)
         joblib.dump(sample_id, sample_id_path, compress=9)
+
+        # delete archives
+        for f in files:
+            f.close()
+            remove(f.name)
     else:
         X = joblib.load(samples_path)
         sample_id = joblib.load(sample_id_path)
 
+
     # load target (y), categories, and sample_id_bis
     if download_if_missing and (not exists(sample_topics_path) or
                                 not exists(topics_path)):
@@ -195,20 +197,21 @@
         y = np.zeros((N_SAMPLES, N_CATEGORIES), dtype=np.uint8)
         sample_id_bis = np.zeros(N_SAMPLES, dtype=np.int32)
         category_names = {}
-        for line in GzipFile(filename=topics_archive_path, mode='rb'):
-            line_components = line.decode("ascii").split(u" ")
-            if len(line_components) == 3:
-                cat, doc, _ = line_components
-                if cat not in category_names:
-                    n_cat += 1
-                    category_names[cat] = n_cat
-
-                doc = int(doc)
-                if doc != doc_previous:
-                    doc_previous = doc
-                    n_doc += 1
-                    sample_id_bis[n_doc] = doc
-                y[n_doc, category_names[cat]] = 1
+        with GzipFile(filename=topics_archive_path, mode='rb') as f:
+            for line in f:
+                line_components = line.decode("ascii").split(u" ")
+                if len(line_components) == 3:
+                    cat, doc, _ = line_components
+                    if cat not in category_names:
+                        n_cat += 1
+                        category_names[cat] = n_cat
+
+                    doc = int(doc)
+                    if doc != doc_previous:
+                        doc_previous = doc
+                        n_doc += 1
+                        sample_id_bis[n_doc] = doc
+                    y[n_doc, category_names[cat]] = 1
 
         # delete archive
         remove(topics_archive_path)
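
A small sketch (throwaway .gz file, illustrative contents) of the two patterns this fix applies in rcv1.py: explicitly close()-ing the open archive handles before remove(), and reading the topics archive through a GzipFile used as a context manager so it is closed before the file is deleted.

import gzip
import os
import tempfile
from gzip import GzipFile

tmpdir = tempfile.mkdtemp()
topics_archive_path = os.path.join(tmpdir, "topics.dat.gz")
with gzip.open(topics_archive_path, "wb") as g:
    g.write(b"C11 2286 1\nC24 2286 1\n")

# Pattern 1: close every handle before deleting the file behind it.
files = [GzipFile(topics_archive_path, mode='rb')]
for f in files:
    f.close()     # without this, remove(f.name) fails on Windows (WinError 32)

# Pattern 2: the context manager closes the archive when the block exits.
category_names = {}
with GzipFile(filename=topics_archive_path, mode='rb') as f:
    for line in f:
        cat, doc, _ = line.decode("ascii").split(u" ")
        category_names.setdefault(cat, len(category_names))

os.remove(topics_archive_path)   # safe: no open handles remain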

sklearn/datasets/species_distributions.py

Lines changed: 14 additions & 16 deletions
@@ -240,29 +240,27 @@ def fetch_species_distributions(data_home=None,
         logger.info('Downloading species data from %s to %s' % (
             SAMPLES.url, data_home))
         samples_path = _fetch_remote(SAMPLES, dirname=data_home)
-        X = np.load(samples_path)  # samples.zip is a valid npz
+        with np.load(samples_path) as X:  # samples.zip is a valid npz
+            for f in X.files:
+                fhandle = BytesIO(X[f])
+                if 'train' in f:
+                    train = _load_csv(fhandle)
+                if 'test' in f:
+                    test = _load_csv(fhandle)
         remove(samples_path)
 
-        for f in X.files:
-            fhandle = BytesIO(X[f])
-            if 'train' in f:
-                train = _load_csv(fhandle)
-            if 'test' in f:
-                test = _load_csv(fhandle)
-
         logger.info('Downloading coverage data from %s to %s' % (
             COVERAGES.url, data_home))
         coverages_path = _fetch_remote(COVERAGES, dirname=data_home)
-        X = np.load(coverages_path)  # coverages.zip is a valid npz
+        with np.load(coverages_path) as X:  # coverages.zip is a valid npz
+            coverages = []
+            for f in X.files:
+                fhandle = BytesIO(X[f])
+                logger.debug(' - converting {}'.format(f))
+                coverages.append(_load_coverage(fhandle))
+            coverages = np.asarray(coverages, dtype=dtype)
         remove(coverages_path)
 
-        coverages = []
-        for f in X.files:
-            fhandle = BytesIO(X[f])
-            logger.debug(' - converting {}'.format(f))
-            coverages.append(_load_coverage(fhandle))
-        coverages = np.asarray(coverages, dtype=dtype)
-
         bunch = Bunch(coverages=coverages,
                       test=test,
                       train=train,
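
A sketch (throwaway .npz file, illustrative array names) of the NpzFile pattern used above: np.load() on an .npz archive returns an NpzFile that keeps the underlying zip open, and since it supports the context-manager protocol the with block closes it before the archive file is removed.

import os
import tempfile

import numpy as np

tmpdir = tempfile.mkdtemp()
samples_path = os.path.join(tmpdir, "samples.npz")
np.savez(samples_path, train=np.arange(6).reshape(2, 3), test=np.arange(3))

with np.load(samples_path) as X:   # X is an NpzFile backed by the open zip
    for f in X.files:              # members must be read inside the block
        if 'train' in f:
            train = X[f]
        if 'test' in f:
            test = X[f]

os.remove(samples_path)            # the zip was closed on exiting the block,
                                   # so removal also succeeds on Windows
print(train.shape, test.shape)     # (2, 3) (3,)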
