MNT Use copy=False when creating DataFrames (#26272) · lesteve/scikit-learn@ae78c25 · GitHub
[go: up one dir, main page]

Skip to content

Commit ae78c25

Browse files
authored
MNT Use copy=False when creating DataFrames (scikit-learn#26272)
1 parent 701537e commit ae78c25

File tree

5 files changed

+18
-16
lines changed

5 files changed

+18
-16
lines changed

sklearn/datasets/_arff_parser.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -187,7 +187,7 @@ def _io_to_generator(gzip_file):
187187

188188
# calculate chunksize
189189
first_row = next(arff_container["data"])
190-
first_df = pd.DataFrame([first_row], columns=columns_names)
190+
first_df = pd.DataFrame([first_row], columns=columns_names, copy=False)
191191

192192
row_bytes = first_df.memory_usage(deep=True).sum()
193193
chunksize = get_chunk_n_rows(row_bytes)
@@ -196,7 +196,9 @@ def _io_to_generator(gzip_file):
196196
columns_to_keep = [col for col in columns_names if col in columns_to_select]
197197
dfs = [first_df[columns_to_keep]]
198198
for data in _chunk_generator(arff_container["data"], chunksize):
199-
dfs.append(pd.DataFrame(data, columns=columns_names)[columns_to_keep])
199+
dfs.append(
200+
pd.DataFrame(data, columns=columns_names, copy=False)[columns_to_keep]
201+
)
200202
frame = pd.concat(dfs, ignore_index=True)
201203
del dfs, first_df
202204

sklearn/datasets/_base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ def _convert_data_dataframe(
8686
):
8787
pd = check_pandas_support("{} with as_frame=True".format(caller_name))
8888
if not sparse_data:
89-
data_df = pd.DataFrame(data, columns=feature_names)
89+
data_df = pd.DataFrame(data, columns=feature_names, copy=False)
9090
else:
9191
data_df = pd.DataFrame.sparse.from_spmatrix(data, columns=feature_names)
9292

sklearn/utils/_set_output.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def _wrap_in_pandas_container(
5757
data_to_wrap.columns = columns
5858
return data_to_wrap
5959

60-
return pd.DataFrame(data_to_wrap, index=index, columns=columns)
60+
return pd.DataFrame(data_to_wrap, index=index, columns=columns, copy=False)
6161

6262

6363
def _get_output_config(method, estimator=None):

sklearn/utils/_testing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -843,7 +843,7 @@ def _convert_container(container, constructor_name, columns_name=None, dtype=Non
843843
return sp.sparse.csr_matrix(container, dtype=dtype)
844844
elif constructor_name == "dataframe":
845845
pd = pytest.importorskip("pandas")
846-
return pd.DataFrame(container, columns=columns_name, dtype=dtype)
846+
return pd.DataFrame(container, columns=columns_name, dtype=dtype, copy=False)
847847
elif constructor_name == "series":
848848
pd = pytest.importorskip("pandas")
849849
return pd.Series(container, dtype=dtype)

sklearn/utils/estimator_checks.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -925,11 +925,11 @@ def check_sample_weights_pandas_series(name, estimator_orig):
925925
[3, 4],
926926
]
927927
)
928-
X = pd.DataFrame(_enforce_estimator_tags_X(estimator_orig, X))
928+
X = pd.DataFrame(_enforce_estimator_tags_X(estimator_orig, X), copy=False)
929929
y = pd.Series([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2])
930930
weights = pd.Series([1] * 12)
931931
if _safe_tags(estimator, key="multioutput_only"):
932-
y = pd.DataFrame(y)
932+
y = pd.DataFrame(y, copy=False)
933933
try:
934934
estimator.fit(X, y, sample_weight=weights)
935935
except ValueError:
@@ -3218,10 +3218,10 @@ def check_estimators_data_not_an_array(name, estimator_orig, X, y, obj_type):
32183218

32193219
y_ = np.asarray(y)
32203220
if y_.ndim == 1:
3221-
y_ = pd.Series(y_)
3221+
y_ = pd.Series(y_, copy=False)
32223222
else:
3223-
y_ = pd.DataFrame(y_)
3224-
X_ = pd.DataFrame(np.asarray(X))
3223+
y_ = pd.DataFrame(y_, copy=False)
3224+
X_ = pd.DataFrame(np.asarray(X), copy=False)
32253225

32263226
except ImportError:
32273227
raise SkipTest(
@@ -3897,7 +3897,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig):
38973897
n_samples, n_features = X_orig.shape
38983898

38993899
names = np.array([f"col_{i}" for i in range(n_features)])
3900-
X = pd.DataFrame(X_orig, columns=names)
3900+
X = pd.DataFrame(X_orig, columns=names, copy=False)
39013901

39023902
if is_regressor(estimator):
39033903
y = rng.normal(size=n_samples)
@@ -3985,7 +3985,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig):
39853985
early_stopping_enabled = any(value is True for value in params.values())
39863986

39873987
for invalid_name, additional_message in invalid_names:
3988-
X_bad = pd.DataFrame(X, columns=invalid_name)
3988+
X_bad = pd.DataFrame(X, columns=invalid_name, copy=False)
39893989

39903990
expected_msg = re.escape(
39913991
"The feature names should match those that were passed during fit.\n"
@@ -4094,7 +4094,7 @@ def check_transformer_get_feature_names_out_pandas(name, transformer_orig):
40944094
y_[::2, 1] *= 2
40954095

40964096
feature_names_in = [f"col{i}" for i in range(n_features)]
4097-
df = pd.DataFrame(X, columns=feature_names_in)
4097+
df = pd.DataFrame(X, columns=feature_names_in, copy=False)
40984098
X_transform = transformer.fit_transform(df, y=y_)
40994099

41004100
# error is raised when `input_features` do not match feature_names_in
@@ -4324,7 +4324,7 @@ def _check_generated_dataframe(name, case, outputs_default, outputs_pandas):
43244324
# We always rely on the output of `get_feature_names_out` of the
43254325
# transformer used to generate the dataframe as a ground-truth of the
43264326
# columns.
4327-
expected_dataframe = pd.DataFrame(X_trans, columns=feature_names_pandas)
4327+
expected_dataframe = pd.DataFrame(X_trans, columns=feature_names_pandas, copy=False)
43284328

43294329
try:
43304330
pd.testing.assert_frame_equal(df_trans, expected_dataframe)
@@ -4359,7 +4359,7 @@ def check_set_output_transform_pandas(name, transformer_orig):
43594359
set_random_state(transformer)
43604360

43614361
feature_names_in = [f"col{i}" for i in range(X.shape[1])]
4362-
df = pd.DataFrame(X, columns=feature_names_in)
4362+
df = pd.DataFrame(X, columns=feature_names_in, copy=False)
43634363

43644364
transformer_default = clone(transformer).set_output(transform="default")
43654365
outputs_default = _output_from_fit_transform(transformer_default, name, X, df, y)
@@ -4401,7 +4401,7 @@ def check_global_ouptut_transform_pandas(name, transformer_orig):
44014401
set_random_state(transformer)
44024402

44034403
feature_names_in = [f"col{i}" for i in range(X.shape[1])]
4404-
df = pd.DataFrame(X, columns=feature_names_in)
4404+
df = pd.DataFrame(X, columns=feature_names_in, copy=False)
44054405

44064406
transformer_default = clone(transformer).set_output(transform="default")
44074407
outputs_default = _output_from_fit_transform(transformer_default, name, X, df, y)

0 commit comments

Comments
 (0)
0