From 22ae7890ddc15422431a689f5f7e6aff65e2917b Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 20 Jan 2024 21:07:16 +0100 Subject: [PATCH 01/60] Backport PR #56980 on branch 2.2.x (WEB: Add version 2.2 to the dropdown) (#56983) Backport PR #56980: WEB: Add version 2.2 to the dropdown Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- web/pandas/versions.json | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/web/pandas/versions.json b/web/pandas/versions.json index e355005c7c937..09b3d8492a2c7 100644 --- a/web/pandas/versions.json +++ b/web/pandas/versions.json @@ -5,11 +5,16 @@ "url": "https://pandas.pydata.org/docs/dev/" }, { - "name": "2.1 (stable)", - "version": "2.1", + "name": "2.2 (stable)", + "version": "2.2", "url": "https://pandas.pydata.org/docs/", "preferred": true }, + { + "name": "2.1", + "version": "2.1", + "url": "https://pandas.pydata.org/pandas-docs/version/2.1/", + }, { "name": "2.0", "version": "2.0", From 6c563e3299543746befc6822fe53dea5ddc48979 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 20 Jan 2024 21:36:53 +0100 Subject: [PATCH 02/60] Backport PR #56986 on branch 2.2.x (WEB: Fix typo in dropdown page) (#56987) Backport PR #56986: WEB: Fix typo in dropdown page Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- web/pandas/versions.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/pandas/versions.json b/web/pandas/versions.json index 09b3d8492a2c7..2d2599ae8585b 100644 --- a/web/pandas/versions.json +++ b/web/pandas/versions.json @@ -13,7 +13,7 @@ { "name": "2.1", "version": "2.1", - "url": "https://pandas.pydata.org/pandas-docs/version/2.1/", + "url": "https://pandas.pydata.org/pandas-docs/version/2.1/" }, { "name": "2.0", From bfe6c4f057e477544ee53a346d615cf875971aaf Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 22 Jan 2024 19:24:51 +0100 Subject: [PATCH 03/60] Backport PR #56982 on branch 2.2.x (DOC: Add release notes for 2.2.1) (#56998) Backport PR #56982: DOC: Add release notes for 2.2.1 Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v2.2.1.rst | 36 ++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 doc/source/whatsnew/v2.2.1.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index ec024f36d78b1..3a2ab4c17d1bd 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,7 @@ Version 2.2 .. toctree:: :maxdepth: 2 + v2.2.1 v2.2.0 Version 2.1 diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst new file mode 100644 index 0000000000000..22a6c0cd1028f --- /dev/null +++ b/doc/source/whatsnew/v2.2.1.rst @@ -0,0 +1,36 @@ +.. _whatsnew_221: + +What's new in 2.2.1 (February XX, 2024) +--------------------------------------- + +These are the changes in pandas 2.2.1. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_221.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_221.bug_fixes: + +Bug fixes +~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_221.other: + +Other +~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_221.contributors: + +Contributors +~~~~~~~~~~~~ From 987dcbbce3c9bdb5e422ff6dee65bd057064b7dc Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 22 Jan 2024 21:10:25 +0100 Subject: [PATCH 04/60] Backport PR #57005 on branch 2.2.x (CI: pyarrow nightly failures) (#57013) Backport PR #57005: CI: pyarrow nightly failures Co-authored-by: Luke Manley --- pandas/tests/frame/test_arrow_interface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_arrow_interface.py b/pandas/tests/frame/test_arrow_interface.py index ac7b51cbdfa92..098d1829b973c 100644 --- a/pandas/tests/frame/test_arrow_interface.py +++ b/pandas/tests/frame/test_arrow_interface.py @@ -35,11 +35,11 @@ def test_dataframe_arrow_interface(): def test_dataframe_to_arrow(): df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) - table = pa.RecordBatchReader.from_stream(df) + table = pa.RecordBatchReader.from_stream(df).read_all() expected = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]}) assert table.equals(expected) schema = pa.schema([("a", pa.int8()), ("b", pa.string())]) - table = pa.RecordBatchReader.from_stream(df, schema=schema) + table = pa.RecordBatchReader.from_stream(df, schema=schema).read_all() expected = expected.cast(schema) assert table.equals(expected) From 662e3f8632f112b3f85c68b62445766c8186396c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 23 Jan 2024 01:22:04 +0100 Subject: [PATCH 05/60] Backport PR #57011 on branch 2.2.x (Remove SKIP summary from CI logs) (#57020) Backport PR #57011: Remove SKIP summary from CI logs Co-authored-by: William Ayd --- ci/run_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 48ef21686a26f..39ab0890a32d1 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -10,7 +10,7 @@ echo PYTHONHASHSEED=$PYTHONHASHSEED COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.toml" -PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" +PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fE -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" if [[ "$PATTERN" ]]; then PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" From 3b833cfcc81ba55a1f46603b06fb9a043aa1bba9 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 23 Jan 2024 01:22:19 +0100 Subject: [PATCH 06/60] Backport PR #57018 on branch 2.2.x (REGR: merge_ordered with fill_method="ffill" and how="left") (#57021) Backport PR #57018: REGR: merge_ordered with fill_method="ffill" and how="left" Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.2.1.rst | 2 +- pandas/core/reshape/merge.py | 7 +++--- .../tests/reshape/merge/test_merge_ordered.py | 23 +++++++++++++++++++ 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 22a6c0cd1028f..75445c978d262 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -13,7 +13,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- +- Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) .. --------------------------------------------------------------------------- .. _whatsnew_221.bug_fixes: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 410301b7697f2..646f40f6141d8 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1930,10 +1930,9 @@ def get_result(self, copy: bool | None = True) -> DataFrame: if self.fill_method == "ffill": if left_indexer is None: - raise TypeError("left_indexer cannot be None") - left_indexer = cast("npt.NDArray[np.intp]", left_indexer) - right_indexer = cast("npt.NDArray[np.intp]", right_indexer) - left_join_indexer = libjoin.ffill_indexer(left_indexer) + left_join_indexer = None + else: + left_join_indexer = libjoin.ffill_indexer(left_indexer) if right_indexer is None: right_join_indexer = None else: diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index abd61026b4e37..0bd3ca3cf2c1b 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -219,3 +219,26 @@ def test_ffill_validate_fill_method(self, left, right, invalid_method): ValueError, match=re.escape("fill_method must be 'ffill' or None") ): merge_ordered(left, right, on="key", fill_method=invalid_method) + + def test_ffill_left_merge(self): + # GH 57010 + df1 = DataFrame( + { + "key": ["a", "c", "e", "a", "c", "e"], + "lvalue": [1, 2, 3, 1, 2, 3], + "group": ["a", "a", "a", "b", "b", "b"], + } + ) + df2 = DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]}) + result = merge_ordered( + df1, df2, fill_method="ffill", left_by="group", how="left" + ) + expected = DataFrame( + { + "key": ["a", "c", "e", "a", "c", "e"], + "lvalue": [1, 2, 3, 1, 2, 3], + "group": ["a", "a", "a", "b", "b", "b"], + "rvalue": [np.nan, 2.0, 2.0, np.nan, 2.0, 2.0], + } + ) + tm.assert_frame_equal(result, expected) From fb2cf0fca6f231d1c24fdbed06035457e47f3970 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:12:31 +0100 Subject: [PATCH 07/60] Backport PR #57057 on branch 2.2.x (COMPAT: Make argsort compatable with numpy 2.0) (#57062) Backport PR #57057: COMPAT: Make argsort compatable with numpy 2.0 Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/compat/numpy/function.py | 2 ++ pandas/core/indexes/base.py | 2 +- pandas/core/series.py | 3 +++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index a36e25a9df410..4df30f7f4a8a7 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -138,6 +138,7 @@ def validate_argmax_with_skipna(skipna: bool | ndarray | None, args, kwargs) -> ARGSORT_DEFAULTS["kind"] = "quicksort" ARGSORT_DEFAULTS["order"] = None ARGSORT_DEFAULTS["kind"] = None +ARGSORT_DEFAULTS["stable"] = None validate_argsort = CompatValidator( @@ -149,6 +150,7 @@ def validate_argmax_with_skipna(skipna: bool | ndarray | None, args, kwargs) -> ARGSORT_DEFAULTS_KIND: dict[str, int | None] = {} ARGSORT_DEFAULTS_KIND["axis"] = -1 ARGSORT_DEFAULTS_KIND["order"] = None +ARGSORT_DEFAULTS_KIND["stable"] = None validate_argsort_kind = CompatValidator( ARGSORT_DEFAULTS_KIND, fname="argsort", max_fname_arg_count=0, method="both" ) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 88a08dd55f739..c36dcda6e2972 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -956,7 +956,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs): return self.__array_wrap__(result) @final - def __array_wrap__(self, result, context=None): + def __array_wrap__(self, result, context=None, return_scalar=False): """ Gets called after a ufunc and other functions e.g. np.split. """ diff --git a/pandas/core/series.py b/pandas/core/series.py index 83eb545b9b681..f06a1eb533ba4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4066,6 +4066,7 @@ def argsort( axis: Axis = 0, kind: SortKind = "quicksort", order: None = None, + stable: None = None, ) -> Series: """ Return the integer indices that would sort the Series values. @@ -4082,6 +4083,8 @@ def argsort( information. 'mergesort' and 'stable' are the only stable algorithms. order : None Has no effect but is accepted for compatibility with numpy. + stable : None + Has no effect but is accepted for compatibility with numpy. Returns ------- From 2df78e8821d0e8d5ef51e3842db142faf32a64c1 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:13:44 +0100 Subject: [PATCH 08/60] Backport PR #57058 on branch 2.2.x (BUG: Series.pct_change with empty Series) (#57059) Backport PR #57058: BUG: Series.pct_change with empty Series Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/generic.py | 27 ++++++++++--------- .../tests/series/methods/test_pct_change.py | 8 ++++++ 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 75445c978d262..d3065cdd8b624 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -14,6 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) +- Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) .. --------------------------------------------------------------------------- .. _whatsnew_221.bug_fixes: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f8728c61e46fc..55693d4cdb753 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -12126,19 +12126,20 @@ def pct_change( if limit is lib.no_default: cols = self.items() if self.ndim == 2 else [(None, self)] for _, col in cols: - mask = col.isna().values - mask = mask[np.argmax(~mask) :] - if mask.any(): - warnings.warn( - "The default fill_method='pad' in " - f"{type(self).__name__}.pct_change is deprecated and will " - "be removed in a future version. Either fill in any " - "non-leading NA values prior to calling pct_change or " - "specify 'fill_method=None' to not fill NA values.", - FutureWarning, - stacklevel=find_stack_level(), - ) - break + if len(col) > 0: + mask = col.isna().values + mask = mask[np.argmax(~mask) :] + if mask.any(): + warnings.warn( + "The default fill_method='pad' in " + f"{type(self).__name__}.pct_change is deprecated and " + "will be removed in a future version. Either fill in " + "any non-leading NA values prior to calling pct_change " + "or specify 'fill_method=None' to not fill NA values.", + FutureWarning, + stacklevel=find_stack_level(), + ) + break fill_method = "pad" if limit is lib.no_default: limit = None diff --git a/pandas/tests/series/methods/test_pct_change.py b/pandas/tests/series/methods/test_pct_change.py index 9727ef3d5c27c..6c80e711c3684 100644 --- a/pandas/tests/series/methods/test_pct_change.py +++ b/pandas/tests/series/methods/test_pct_change.py @@ -118,3 +118,11 @@ def test_pct_change_no_warning_na_beginning(): result = ser.pct_change() expected = Series([np.nan, np.nan, np.nan, 1, 0.5]) tm.assert_series_equal(result, expected) + + +def test_pct_change_empty(): + # GH 57056 + ser = Series([], dtype="float64") + expected = ser.copy() + result = ser.pct_change(periods=0) + tm.assert_series_equal(expected, result) From 441f65df5c6dea850e6334ab770528184dfb0d33 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 25 Jan 2024 19:03:14 +0100 Subject: [PATCH 09/60] Backport PR #57034 on branch 2.2.x (REGR: perf regression in Series.combine_first) (#57072) Backport PR #57034: REGR: perf regression in Series.combine_first Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/series.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index d3065cdd8b624..93965ffed23d3 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -13,6 +13,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) diff --git a/pandas/core/series.py b/pandas/core/series.py index f06a1eb533ba4..c6a905cbb6ec1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -86,6 +86,7 @@ from pandas.core.dtypes.dtypes import ( CategoricalDtype, ExtensionDtype, + SparseDtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -3510,6 +3511,13 @@ def combine_first(self, other) -> Series: """ from pandas.core.reshape.concat import concat + if self.dtype == other.dtype: + if self.index.equals(other.index): + return self.mask(self.isna(), other) + elif self._can_hold_na and not isinstance(self.dtype, SparseDtype): + this, other = self.align(other, join="outer") + return this.mask(this.isna(), other) + new_index = self.index.union(other.index) this = self From c45129431ef4b089b14f0f3a892cc115e155315b Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 26 Jan 2024 19:41:39 +0100 Subject: [PATCH 10/60] Backport PR #57046 on branch 2.2.x (REGR: groupby.idxmin/idxmax wrong result on extreme values) (#57086) Backport PR #57046: REGR: groupby.idxmin/idxmax wrong result on extreme values Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 2 + pandas/_libs/groupby.pyx | 17 ++++--- pandas/core/groupby/ops.py | 1 + pandas/tests/groupby/test_reductions.py | 62 +++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 93965ffed23d3..23da4a7f6ab25 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -15,6 +15,8 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) +- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) +- Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 19d71b0a6fde3..ac24c0bb8df88 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1767,6 +1767,7 @@ def group_idxmin_idxmax( Py_ssize_t i, j, N, K, lab numeric_object_t val numeric_object_t[:, ::1] group_min_or_max + uint8_t[:, ::1] seen bint uses_mask = mask is not None bint isna_entry bint compute_max = name == "idxmax" @@ -1780,13 +1781,10 @@ def group_idxmin_idxmax( if numeric_object_t is object: group_min_or_max = np.empty((out).shape, dtype=object) + seen = np.zeros((out).shape, dtype=np.uint8) else: group_min_or_max = np.empty_like(out, dtype=values.dtype) - if N > 0 and K > 0: - # When N or K is zero, we never use group_min_or_max - group_min_or_max[:] = _get_min_or_max( - values[0, 0], compute_max, is_datetimelike - ) + seen = np.zeros_like(out, dtype=np.uint8) # When using transform, we need a valid value for take in the case # a category is not observed; these values will be dropped @@ -1802,6 +1800,7 @@ def group_idxmin_idxmax( if not skipna and out[lab, j] == -1: # Once we've hit NA there is no going back continue + val = values[i, j] if uses_mask: @@ -1810,10 +1809,14 @@ def group_idxmin_idxmax( isna_entry = _treat_as_na(val, is_datetimelike) if isna_entry: - if not skipna: + if not skipna or not seen[lab, j]: out[lab, j] = -1 else: - if compute_max: + if not seen[lab, j]: + seen[lab, j] = True + group_min_or_max[lab, j] = val + out[lab, j] = i + elif compute_max: if val > group_min_or_max[lab, j]: group_min_or_max[lab, j] = val out[lab, j] = i diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 5e83eaee02afc..e2ddf9aa5c0c1 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -424,6 +424,7 @@ def _call_cython_op( mask=mask, result_mask=result_mask, is_datetimelike=is_datetimelike, + **kwargs, ) elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]: if self.how in ["std", "sem"]: diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 425079f943aba..422322d03c4c0 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -195,6 +195,68 @@ def test_empty(frame_or_series, bool_agg_func): tm.assert_equal(result, expected) +@pytest.mark.parametrize("how", ["idxmin", "idxmax"]) +def test_idxmin_idxmax_extremes(how, any_real_numpy_dtype): + # GH#57040 + if any_real_numpy_dtype is int or any_real_numpy_dtype is float: + # No need to test + return + info = np.iinfo if "int" in any_real_numpy_dtype else np.finfo + min_value = info(any_real_numpy_dtype).min + max_value = info(any_real_numpy_dtype).max + df = DataFrame( + {"a": [2, 1, 1, 2], "b": [min_value, max_value, max_value, min_value]}, + dtype=any_real_numpy_dtype, + ) + gb = df.groupby("a") + result = getattr(gb, how)() + expected = DataFrame( + {"b": [1, 0]}, index=pd.Index([1, 2], name="a", dtype=any_real_numpy_dtype) + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("how", ["idxmin", "idxmax"]) +def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype): + # GH#57040 + min_value = np.finfo(float_numpy_dtype).min + max_value = np.finfo(float_numpy_dtype).max + df = DataFrame( + { + "a": Series(np.repeat(range(1, 6), repeats=2), dtype="intp"), + "b": Series( + [ + np.nan, + min_value, + np.nan, + max_value, + min_value, + np.nan, + max_value, + np.nan, + np.nan, + np.nan, + ], + dtype=float_numpy_dtype, + ), + }, + ) + gb = df.groupby("a") + + warn = None if skipna else FutureWarning + msg = f"The behavior of DataFrameGroupBy.{how} with all-NA values" + with tm.assert_produces_warning(warn, match=msg): + result = getattr(gb, how)(skipna=skipna) + if skipna: + values = [1, 3, 4, 6, np.nan] + else: + values = np.nan + expected = DataFrame( + {"b": values}, index=pd.Index(range(1, 6), name="a", dtype="intp") + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "func, values", [ From b44512726e9d55421f67683ff6d26ec7ce82046b Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 26 Jan 2024 20:50:08 +0100 Subject: [PATCH 11/60] Backport PR #57084 on branch 2.2.x (Fix mem leak in read_csv) (#57090) Backport PR #57084: Fix mem leak in read_csv Co-authored-by: William Ayd --- asv_bench/benchmarks/io/csv.py | 3 +++ doc/source/whatsnew/v2.2.1.rst | 1 + pandas/_libs/src/parser/tokenizer.c | 9 +++++++++ 3 files changed, 13 insertions(+) diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 9ac83db4f85b9..dae6107db4d92 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -408,6 +408,9 @@ def time_read_stringcsv(self, engine): def time_read_bytescsv(self, engine): read_csv(self.data(self.BytesIO_input), engine=engine) + def peakmem_read_csv(self, engine): + read_csv(self.data(self.BytesIO_input), engine=engine) + class ReadCSVCategorical(BaseIO): fname = "__test__.csv" diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 23da4a7f6ab25..b9b2821ebc468 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -13,6 +13,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed memory leak in :func:`read_csv` (:issue:`57039`) - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 0e4188bea4dc7..c9f7a796a9b1c 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -109,6 +109,14 @@ void parser_set_default_options(parser_t *self) { parser_t *parser_new(void) { return (parser_t *)calloc(1, sizeof(parser_t)); } +static void parser_clear_data_buffers(parser_t *self) { + free_if_not_null((void *)&self->stream); + free_if_not_null((void *)&self->words); + free_if_not_null((void *)&self->word_starts); + free_if_not_null((void *)&self->line_start); + free_if_not_null((void *)&self->line_fields); +} + static void parser_cleanup(parser_t *self) { // XXX where to put this free_if_not_null((void *)&self->error_msg); @@ -119,6 +127,7 @@ static void parser_cleanup(parser_t *self) { self->skipset = NULL; } + parser_clear_data_buffers(self); if (self->cb_cleanup != NULL) { self->cb_cleanup(self->source); self->cb_cleanup = NULL; From 1550858f6e34f85bb9d489891d4c5ac3146b01b6 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 27 Jan 2024 20:38:01 +0100 Subject: [PATCH 12/60] Backport PR #57078 on branch 2.2.x (54628 fix find stack level memory leak) (#57105) Backport PR #57078: 54628 fix find stack level memory leak Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/_testing/_warnings.py | 7 ++++++- pandas/util/_exceptions.py | 24 +++++++++++++++--------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py index f11dc11f6ac0d..c9a287942f2da 100644 --- a/pandas/_testing/_warnings.py +++ b/pandas/_testing/_warnings.py @@ -218,7 +218,12 @@ def _assert_raised_with_correct_stacklevel( frame = inspect.currentframe() for _ in range(4): frame = frame.f_back # type: ignore[union-attr] - caller_filename = inspect.getfile(frame) # type: ignore[arg-type] + try: + caller_filename = inspect.getfile(frame) # type: ignore[arg-type] + finally: + # See note in + # https://docs.python.org/3/library/inspect.html#inspect.Traceback + del frame msg = ( "Warning not set with correct stacklevel. " f"File where warning is raised: {actual_warning.filename} != " diff --git a/pandas/util/_exceptions.py b/pandas/util/_exceptions.py index 573f76a63459b..5f50838d37315 100644 --- a/pandas/util/_exceptions.py +++ b/pandas/util/_exceptions.py @@ -9,6 +9,7 @@ if TYPE_CHECKING: from collections.abc import Generator + from types import FrameType @contextlib.contextmanager @@ -42,15 +43,20 @@ def find_stack_level() -> int: test_dir = os.path.join(pkg_dir, "tests") # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow - frame = inspect.currentframe() - n = 0 - while frame: - fname = inspect.getfile(frame) - if fname.startswith(pkg_dir) and not fname.startswith(test_dir): - frame = frame.f_back - n += 1 - else: - break + frame: FrameType | None = inspect.currentframe() + try: + n = 0 + while frame: + filename = inspect.getfile(frame) + if filename.startswith(pkg_dir) and not filename.startswith(test_dir): + frame = frame.f_back + n += 1 + else: + break + finally: + # See note in + # https://docs.python.org/3/library/inspect.html#inspect.Traceback + del frame return n From f577be25dfe97e6e4fa4192e007df2882fb3f20f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 29 Jan 2024 04:44:40 +0100 Subject: [PATCH 13/60] Backport PR #57089 on branch 2.2.x (BUG: wide_to_long with string columns) (#57120) Backport PR #57089: BUG: wide_to_long with string columns Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/reshape/melt.py | 3 +-- pandas/core/strings/accessor.py | 4 ++-- pandas/tests/reshape/test_melt.py | 30 ++++++++++++++++++++++++++++++ 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index b9b2821ebc468..660594c98e0f2 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -16,6 +16,7 @@ Fixed regressions - Fixed memory leak in :func:`read_csv` (:issue:`57039`) - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) +- Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index bb1cd0d738dac..e54f847895f1a 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -458,8 +458,7 @@ def wide_to_long( def get_var_names(df, stub: str, sep: str, suffix: str): regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$" - pattern = re.compile(regex) - return df.columns[df.columns.str.match(pattern)] + return df.columns[df.columns.str.match(regex)] def melt_stub(df, stub: str, i, j, value_vars, sep: str): newdf = melt( diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 1b7d632c0fa80..da10a12d02ae4 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1336,14 +1336,14 @@ def contains( return self._wrap_result(result, fill_value=na, returns_string=False) @forbid_nonstring_types(["bytes"]) - def match(self, pat, case: bool = True, flags: int = 0, na=None): + def match(self, pat: str, case: bool = True, flags: int = 0, na=None): """ Determine if each string starts with a match of a regular expression. Parameters ---------- pat : str - Character sequence or regular expression. + Character sequence. case : bool, default True If True, case sensitive. flags : int, default 0 (no flags) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index ff9f927597956..272c5b3403293 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -1220,3 +1220,33 @@ def test_missing_stubname(self, dtype): new_level = expected.index.levels[0].astype(dtype) expected.index = expected.index.set_levels(new_level, level=0) tm.assert_frame_equal(result, expected) + + +def test_wide_to_long_pyarrow_string_columns(): + # GH 57066 + pytest.importorskip("pyarrow") + df = DataFrame( + { + "ID": {0: 1}, + "R_test1": {0: 1}, + "R_test2": {0: 1}, + "R_test3": {0: 2}, + "D": {0: 1}, + } + ) + df.columns = df.columns.astype("string[pyarrow_numpy]") + result = wide_to_long( + df, stubnames="R", i="ID", j="UNPIVOTED", sep="_", suffix=".*" + ) + expected = DataFrame( + [[1, 1], [1, 1], [1, 2]], + columns=Index(["D", "R"], dtype=object), + index=pd.MultiIndex.from_arrays( + [ + [1, 1, 1], + Index(["test1", "test2", "test3"], dtype="string[pyarrow_numpy]"), + ], + names=["ID", "UNPIVOTED"], + ), + ) + tm.assert_frame_equal(result, expected) From df0762d18eecb567b6813cc5fd5fabc6af214aa5 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 29 Jan 2024 21:53:39 +0100 Subject: [PATCH 14/60] Backport PR #57126 on branch 2.2.x (Bump pypa/cibuildwheel from 2.16.2 to 2.16.4) (#57132) Backport PR #57126: Bump pypa/cibuildwheel from 2.16.2 to 2.16.4 Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/wheels.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 841559c8e9799..6d3b9048a2122 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -139,7 +139,7 @@ jobs: - name: Build normal wheels if: ${{ (env.IS_SCHEDULE_DISPATCH != 'true' || env.IS_PUSH == 'true') }} - uses: pypa/cibuildwheel@v2.16.2 + uses: pypa/cibuildwheel@v2.16.4 with: package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: @@ -148,7 +148,7 @@ jobs: - name: Build nightly wheels (with NumPy pre-release) if: ${{ (env.IS_SCHEDULE_DISPATCH == 'true' && env.IS_PUSH != 'true') }} - uses: pypa/cibuildwheel@v2.16.2 + uses: pypa/cibuildwheel@v2.16.4 with: package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: From c1723cd09e83248dd3fbc81bb43ba537082a4fe0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 29 Jan 2024 21:53:49 +0100 Subject: [PATCH 15/60] Backport PR #57101 on branch 2.2.x (REGR: Index.join raising TypeError when joining an empty index to a mixed type index) (#57133) Backport PR #57101: REGR: Index.join raising TypeError when joining an empty index to a mixed type index Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/indexes/base.py | 65 +++++++++++++------------ pandas/tests/reshape/merge/test_join.py | 19 ++++++++ 3 files changed, 53 insertions(+), 32 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 660594c98e0f2..ff5fadec735e6 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -19,6 +19,7 @@ Fixed regressions - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) +- Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c36dcda6e2972..5e7f2e27f1275 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4615,38 +4615,12 @@ def join( if level is not None and (self._is_multi or other._is_multi): return self._join_level(other, level, how=how) - lidx: np.ndarray | None - ridx: np.ndarray | None - - if len(other) == 0: - if how in ("left", "outer"): - if sort and not self.is_monotonic_increasing: - lidx = self.argsort() - join_index = self.take(lidx) - else: - lidx = None - join_index = self._view() - ridx = np.broadcast_to(np.intp(-1), len(join_index)) - return join_index, lidx, ridx - elif how in ("right", "inner", "cross"): - join_index = other._view() - lidx = np.array([], dtype=np.intp) - return join_index, lidx, None - - if len(self) == 0: - if how in ("right", "outer"): - if sort and not other.is_monotonic_increasing: - ridx = other.argsort() - join_index = other.take(ridx) - else: - ridx = None - join_index = other._view() - lidx = np.broadcast_to(np.intp(-1), len(join_index)) - return join_index, lidx, ridx - elif how in ("left", "inner", "cross"): - join_index = self._view() - ridx = np.array([], dtype=np.intp) - return join_index, None, ridx + if len(self) == 0 or len(other) == 0: + try: + return self._join_empty(other, how, sort) + except TypeError: + # object dtype; non-comparable objects + pass if self.dtype != other.dtype: dtype = self._find_common_type_compat(other) @@ -4681,6 +4655,33 @@ def join( return self._join_via_get_indexer(other, how, sort) + @final + def _join_empty( + self, other: Index, how: JoinHow, sort: bool + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + assert len(self) == 0 or len(other) == 0 + _validate_join_method(how) + + lidx: np.ndarray | None + ridx: np.ndarray | None + + if len(other): + how = cast(JoinHow, {"left": "right", "right": "left"}.get(how, how)) + join_index, ridx, lidx = other._join_empty(self, how, sort) + elif how in ["left", "outer"]: + if sort and not self.is_monotonic_increasing: + lidx = self.argsort() + join_index = self.take(lidx) + else: + lidx = None + join_index = self._view() + ridx = np.broadcast_to(np.intp(-1), len(join_index)) + else: + join_index = other._view() + lidx = np.array([], dtype=np.intp) + ridx = None + return join_index, lidx, ridx + @final def _join_via_get_indexer( self, other: Index, how: JoinHow, sort: bool diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 9a2f18f33bce5..db5a0437a14f0 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -1042,6 +1042,25 @@ def test_join_empty(left_empty, how, exp): tm.assert_frame_equal(result, expected) +def test_join_empty_uncomparable_columns(): + # GH 57048 + df1 = DataFrame() + df2 = DataFrame(columns=["test"]) + df3 = DataFrame(columns=["foo", ("bar", "baz")]) + + result = df1 + df2 + expected = DataFrame(columns=["test"]) + tm.assert_frame_equal(result, expected) + + result = df2 + df3 + expected = DataFrame(columns=[("bar", "baz"), "foo", "test"]) + tm.assert_frame_equal(result, expected) + + result = df1 + df3 + expected = DataFrame(columns=[("bar", "baz"), "foo"]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "how, values", [ From 10b58730b6a302a6c372d6d4e1b04cb87007de97 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 29 Jan 2024 23:48:32 +0100 Subject: [PATCH 16/60] Backport PR #57122 on branch 2.2.x (CI: autouse add_doctest_imports) (#57135) Backport PR #57122: CI: autouse add_doctest_imports Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- pandas/conftest.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 983272d79081e..a11d9c2b4b2a1 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -190,10 +190,6 @@ def pytest_collection_modifyitems(items, config) -> None: if is_doctest: for item in items: - # autouse=True for the add_doctest_imports can lead to expensive teardowns - # since doctest_namespace is a session fixture - item.add_marker(pytest.mark.usefixtures("add_doctest_imports")) - for path, message in ignored_doctest_warnings: ignore_doctest_warning(item, path, message) @@ -250,7 +246,14 @@ def pytest_collection_modifyitems(items, config) -> None: ) -@pytest.fixture +# ---------------------------------------------------------------- +# Autouse fixtures +# ---------------------------------------------------------------- + + +# https://github.com/pytest-dev/pytest/issues/11873 +# Would like to avoid autouse=True, but cannot as of pytest 8.0.0 +@pytest.fixture(autouse=True) def add_doctest_imports(doctest_namespace) -> None: """ Make `np` and `pd` names available for doctests. @@ -259,9 +262,6 @@ def add_doctest_imports(doctest_namespace) -> None: doctest_namespace["pd"] = pd -# ---------------------------------------------------------------- -# Autouse fixtures -# ---------------------------------------------------------------- @pytest.fixture(autouse=True) def configure_tests() -> None: """ From acd914dedba087b3642e110aadac665006da23c0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 30 Jan 2024 05:36:42 +0100 Subject: [PATCH 17/60] Backport PR #57102 on branch 2.2.x (ENH: Add skipna to groupby.first and groupby.last) (#57141) Backport PR #57102: ENH: Add skipna to groupby.first and groupby.last Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 3 +- pandas/_libs/groupby.pyi | 2 ++ pandas/_libs/groupby.pyx | 41 ++++++++++++++-------- pandas/_testing/__init__.py | 7 ++++ pandas/conftest.py | 32 +++++++++++++++++ pandas/core/groupby/groupby.py | 36 ++++++++++++++----- pandas/core/resample.py | 10 ++++-- pandas/tests/groupby/test_reductions.py | 31 ++++++++++++++++ pandas/tests/resample/test_base.py | 29 +++++++++++++++ pandas/tests/resample/test_resample_api.py | 4 +-- 10 files changed, 167 insertions(+), 28 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index ff5fadec735e6..589903ddcca71 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -34,7 +34,8 @@ Bug fixes Other ~~~~~ -- +- Added the argument ``skipna`` to :meth:`DataFrameGroupBy.first`, :meth:`DataFrameGroupBy.last`, :meth:`SeriesGroupBy.first`, and :meth:`SeriesGroupBy.last`; achieving ``skipna=False`` used to be available via :meth:`DataFrameGroupBy.nth`, but the behavior was changed in pandas 2.0.0 (:issue:`57019`) +- Added the argument ``skipna`` to :meth:`Resampler.first`, :meth:`Resampler.last` (:issue:`57019`) .. --------------------------------------------------------------------------- .. _whatsnew_221.contributors: diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 135828a23648a..a494b61fa7e3d 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -136,6 +136,7 @@ def group_last( result_mask: npt.NDArray[np.bool_] | None = ..., min_count: int = ..., # Py_ssize_t is_datetimelike: bool = ..., + skipna: bool = ..., ) -> None: ... def group_nth( out: np.ndarray, # rank_t[:, ::1] @@ -147,6 +148,7 @@ def group_nth( min_count: int = ..., # int64_t rank: int = ..., # int64_t is_datetimelike: bool = ..., + skipna: bool = ..., ) -> None: ... def group_rank( out: np.ndarray, # float64_t[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index ac24c0bb8df88..b855d64d0be18 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1424,6 +1424,7 @@ def group_last( uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=-1, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -1458,14 +1459,19 @@ def group_last( for j in range(K): val = values[i, j] - if uses_mask: - isna_entry = mask[i, j] - else: - isna_entry = _treat_as_na(val, is_datetimelike) + if skipna: + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, is_datetimelike) + if isna_entry: + continue - if not isna_entry: - nobs[lab, j] += 1 - resx[lab, j] = val + nobs[lab, j] += 1 + resx[lab, j] = val + + if uses_mask and not skipna: + result_mask[lab, j] = mask[i, j] _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx @@ -1486,6 +1492,7 @@ def group_nth( int64_t min_count=-1, int64_t rank=1, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -1520,15 +1527,19 @@ def group_nth( for j in range(K): val = values[i, j] - if uses_mask: - isna_entry = mask[i, j] - else: - isna_entry = _treat_as_na(val, is_datetimelike) + if skipna: + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, is_datetimelike) + if isna_entry: + continue - if not isna_entry: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + if uses_mask and not skipna: + result_mask[lab, j] = mask[i, j] _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 672c16a85086c..361998db8e38b 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -236,11 +236,18 @@ + TIMEDELTA_PYARROW_DTYPES + BOOL_PYARROW_DTYPES ) + ALL_REAL_PYARROW_DTYPES_STR_REPR = ( + ALL_INT_PYARROW_DTYPES_STR_REPR + FLOAT_PYARROW_DTYPES_STR_REPR + ) else: FLOAT_PYARROW_DTYPES_STR_REPR = [] ALL_INT_PYARROW_DTYPES_STR_REPR = [] ALL_PYARROW_DTYPES = [] + ALL_REAL_PYARROW_DTYPES_STR_REPR = [] +ALL_REAL_NULLABLE_DTYPES = ( + FLOAT_NUMPY_DTYPES + ALL_REAL_EXTENSION_DTYPES + ALL_REAL_PYARROW_DTYPES_STR_REPR +) arithmetic_dunder_methods = [ "__add__", diff --git a/pandas/conftest.py b/pandas/conftest.py index a11d9c2b4b2a1..7c35dfdde90ba 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1642,6 +1642,38 @@ def any_numpy_dtype(request): return request.param +@pytest.fixture(params=tm.ALL_REAL_NULLABLE_DTYPES) +def any_real_nullable_dtype(request): + """ + Parameterized fixture for all real dtypes that can hold NA. + + * float + * 'float32' + * 'float64' + * 'Float32' + * 'Float64' + * 'UInt8' + * 'UInt16' + * 'UInt32' + * 'UInt64' + * 'Int8' + * 'Int16' + * 'Int32' + * 'Int64' + * 'uint8[pyarrow]' + * 'uint16[pyarrow]' + * 'uint32[pyarrow]' + * 'uint64[pyarrow]' + * 'int8[pyarrow]' + * 'int16[pyarrow]' + * 'int32[pyarrow]' + * 'int64[pyarrow]' + * 'float[pyarrow]' + * 'double[pyarrow]' + """ + return request.param + + @pytest.fixture(params=tm.ALL_NUMERIC_DTYPES) def any_numeric_dtype(request): """ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5b18455dbe8a8..db8949788567b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3335,9 +3335,13 @@ def max( ) @final - def first(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: + def first( + self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True + ) -> NDFrameT: """ - Compute the first non-null entry of each column. + Compute the first entry of each column within each group. + + Defaults to skipping NA elements. Parameters ---------- @@ -3345,12 +3349,17 @@ def first(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: Include only float, int, boolean columns. min_count : int, default -1 The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. + than ``min_count`` valid values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 2.2.1 Returns ------- Series or DataFrame - First non-null of values within each group. + First values within each group. See Also -------- @@ -3402,12 +3411,17 @@ def first(x: Series): min_count=min_count, alias="first", npfunc=first_compat, + skipna=skipna, ) @final - def last(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: + def last( + self, numeric_only: bool = False, min_count: int = -1, skipna: bool = True + ) -> NDFrameT: """ - Compute the last non-null entry of each column. + Compute the last entry of each column within each group. + + Defaults to skipping NA elements. Parameters ---------- @@ -3416,12 +3430,17 @@ def last(self, numeric_only: bool = False, min_count: int = -1) -> NDFrameT: everything, then use only numeric data. min_count : int, default -1 The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. + than ``min_count`` valid values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + + .. versionadded:: 2.2.1 Returns ------- Series or DataFrame - Last non-null of values within each group. + Last of values within each group. See Also -------- @@ -3461,6 +3480,7 @@ def last(x: Series): min_count=min_count, alias="last", npfunc=last_compat, + skipna=skipna, ) @final diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 3e9507bd4347f..2d430ef4dcff6 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1306,12 +1306,15 @@ def first( self, numeric_only: bool = False, min_count: int = 0, + skipna: bool = True, *args, **kwargs, ): maybe_warn_args_and_kwargs(type(self), "first", args, kwargs) nv.validate_resampler_func("first", args, kwargs) - return self._downsample("first", numeric_only=numeric_only, min_count=min_count) + return self._downsample( + "first", numeric_only=numeric_only, min_count=min_count, skipna=skipna + ) @final @doc(GroupBy.last) @@ -1319,12 +1322,15 @@ def last( self, numeric_only: bool = False, min_count: int = 0, + skipna: bool = True, *args, **kwargs, ): maybe_warn_args_and_kwargs(type(self), "last", args, kwargs) nv.validate_resampler_func("last", args, kwargs) - return self._downsample("last", numeric_only=numeric_only, min_count=min_count) + return self._downsample( + "last", numeric_only=numeric_only, min_count=min_count, skipna=skipna + ) @final @doc(GroupBy.median) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 422322d03c4c0..25b0f80639cff 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -7,6 +7,9 @@ from pandas._libs.tslibs import iNaT +from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.missing import na_value_for_dtype + import pandas as pd from pandas import ( DataFrame, @@ -327,6 +330,34 @@ def test_groupby_non_arithmetic_agg_int_like_precision(method, data): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("how", ["first", "last"]) +def test_first_last_skipna(any_real_nullable_dtype, sort, skipna, how): + # GH#57019 + na_value = na_value_for_dtype(pandas_dtype(any_real_nullable_dtype)) + df = DataFrame( + { + "a": [2, 1, 1, 2, 3, 3], + "b": [na_value, 3.0, na_value, 4.0, np.nan, np.nan], + "c": [na_value, 3.0, na_value, 4.0, np.nan, np.nan], + }, + dtype=any_real_nullable_dtype, + ) + gb = df.groupby("a", sort=sort) + method = getattr(gb, how) + result = method(skipna=skipna) + + ilocs = { + ("first", True): [3, 1, 4], + ("first", False): [0, 1, 4], + ("last", True): [3, 1, 5], + ("last", False): [3, 2, 5], + }[how, skipna] + expected = df.iloc[ilocs].set_index("a") + if sort: + expected = expected.sort_index() + tm.assert_frame_equal(result, expected) + + def test_idxmin_idxmax_axis1(): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"] diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 50644e33e45e1..dcf6c6099abab 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -3,6 +3,9 @@ import numpy as np import pytest +from pandas.core.dtypes.common import is_extension_array_dtype + +import pandas as pd from pandas import ( DataFrame, DatetimeIndex, @@ -429,3 +432,29 @@ def test_resample_quantile(series): result = ser.resample(freq).quantile(q) expected = ser.resample(freq).agg(lambda x: x.quantile(q)).rename(ser.name) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("how", ["first", "last"]) +def test_first_last_skipna(any_real_nullable_dtype, skipna, how): + # GH#57019 + if is_extension_array_dtype(any_real_nullable_dtype): + na_value = Series(dtype=any_real_nullable_dtype).dtype.na_value + else: + na_value = np.nan + df = DataFrame( + { + "a": [2, 1, 1, 2], + "b": [na_value, 3.0, na_value, 4.0], + "c": [na_value, 3.0, na_value, 4.0], + }, + index=date_range("2020-01-01", periods=4, freq="D"), + dtype=any_real_nullable_dtype, + ) + rs = df.resample("ME") + method = getattr(rs, how) + result = method(skipna=skipna) + + gb = df.groupby(df.shape[0] * [pd.to_datetime("2020-01-31")]) + expected = getattr(gb, how)(skipna=skipna) + expected.index.freq = "ME" + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index d3e906827b754..12abd1c98784b 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -1040,11 +1040,11 @@ def test_args_kwargs_depr(method, raises): if raises: with tm.assert_produces_warning(FutureWarning, match=warn_msg): with pytest.raises(UnsupportedFunctionCall, match=error_msg): - func(*args, 1, 2, 3) + func(*args, 1, 2, 3, 4) else: with tm.assert_produces_warning(FutureWarning, match=warn_msg): with pytest.raises(TypeError, match=error_msg_type): - func(*args, 1, 2, 3) + func(*args, 1, 2, 3, 4) def test_df_axis_param_depr(): From c0a269b354ad1b50b9a7fd56789c5137e829fcc2 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 30 Jan 2024 05:37:02 +0100 Subject: [PATCH 18/60] Backport PR #57061 on branch 2.2.x (REGR: non-unique, masked dtype index raising IndexError) (#57142) Backport PR #57061: REGR: non-unique, masked dtype index raising IndexError Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/_libs/index.pyx | 63 +++++++++++++++---------------- pandas/tests/indexing/test_loc.py | 12 ++++++ 3 files changed, 44 insertions(+), 32 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 589903ddcca71..6d6f0f1ee758f 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) +- Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 0dc139781f58d..e4dfe9dec3623 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -96,6 +96,20 @@ cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None): return indexer.view(bool) +cdef _maybe_resize_array(ndarray values, Py_ssize_t loc, Py_ssize_t max_length): + """ + Resize array if loc is out of bounds. + """ + cdef: + Py_ssize_t n = len(values) + + if loc >= n: + while loc >= n: + n *= 2 + values = np.resize(values, min(n, max_length)) + return values + + # Don't populate hash tables in monotonic indexes larger than this _SIZE_CUTOFF = 1_000_000 @@ -450,27 +464,18 @@ cdef class IndexEngine: # found if val in d: key = val - + result = _maybe_resize_array( + result, + count + len(d[key]) - 1, + max_alloc + ) for j in d[key]: - - # realloc if needed - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result = np.resize(result, n_alloc) - result[count] = j count += 1 # value not found else: - - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result = np.resize(result, n_alloc) + result = _maybe_resize_array(result, count, max_alloc) result[count] = -1 count += 1 missing[count_missing] = i @@ -1193,13 +1198,12 @@ cdef class MaskedIndexEngine(IndexEngine): if PySequence_GetItem(target_mask, i): if na_pos: + result = _maybe_resize_array( + result, + count + len(na_pos) - 1, + max_alloc, + ) for na_idx in na_pos: - # realloc if needed - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result[count] = na_idx count += 1 continue @@ -1207,23 +1211,18 @@ cdef class MaskedIndexEngine(IndexEngine): elif val in d: # found key = val - + result = _maybe_resize_array( + result, + count + len(d[key]) - 1, + max_alloc, + ) for j in d[key]: - - # realloc if needed - if count >= n_alloc: - n_alloc *= 2 - if n_alloc > max_alloc: - n_alloc = max_alloc - result[count] = j count += 1 continue # value not found - if count >= n_alloc: - n_alloc += 10_000 - result = np.resize(result, n_alloc) + result = _maybe_resize_array(result, count, max_alloc) result[count] = -1 count += 1 missing[count_missing] = i diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 61c44c8a2a8f4..952251a58e981 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -3364,3 +3364,15 @@ def test_getitem_loc_str_periodindex(self): index = pd.period_range(start="2000", periods=20, freq="B") series = Series(range(20), index=index) assert series.loc["2000-01-14"] == 9 + + def test_loc_nonunique_masked_index(self): + # GH 57027 + ids = list(range(11)) + index = Index(ids * 1000, dtype="Int64") + df = DataFrame({"val": np.arange(len(index), dtype=np.intp)}, index=index) + result = df.loc[ids] + expected = DataFrame( + {"val": index.argsort(kind="stable").astype(np.intp)}, + index=Index(np.array(ids).repeat(1000), dtype="Int64"), + ) + tm.assert_frame_equal(result, expected) From 4bad5fcf172ac443c384da8482a3cc617da908bb Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 30 Jan 2024 05:37:17 +0100 Subject: [PATCH 19/60] Backport PR #57139 on branch 2.2.x (BUG: Index(Series) makes array read only for object dtype) (#57143) Backport PR #57139: BUG: Index(Series) makes array read only for object dtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 2 +- pandas/_libs/ops.pyx | 2 +- pandas/core/common.py | 2 ++ pandas/tests/indexes/base_class/test_constructors.py | 7 +++++++ pandas/tests/indexes/test_common.py | 9 +++++++++ 5 files changed, 20 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 6d6f0f1ee758f..1302648c3fc9a 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -28,7 +28,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`) .. --------------------------------------------------------------------------- .. _whatsnew_221.other: diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index 9154e836b3477..567bfc02a2950 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -29,7 +29,7 @@ from pandas._libs.util cimport is_nan @cython.wraparound(False) @cython.boundscheck(False) -def scalar_compare(object[:] values, object val, object op) -> ndarray: +def scalar_compare(ndarray[object] values, object val, object op) -> ndarray: """ Compare each element of `values` array with the scalar `val`, with the comparison operation described by `op`. diff --git a/pandas/core/common.py b/pandas/core/common.py index 7d864e02be54e..9f024498d66ed 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -233,6 +233,8 @@ def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLi values = list(values) elif isinstance(values, ABCIndex): return values._values + elif isinstance(values, ABCSeries): + return values._values if isinstance(values, list) and dtype in [np.object_, object]: return construct_1d_object_array_from_listlike(values) diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index fd5176a28565e..338509dd239e6 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -71,3 +71,10 @@ def test_inference_on_pandas_objects(self): with tm.assert_produces_warning(FutureWarning, match="Dtype inference"): result = Index(ser) assert result.dtype != np.object_ + + def test_constructor_not_read_only(self): + # GH#57130 + ser = Series([1, 2], dtype=object) + with pd.option_context("mode.copy_on_write", True): + idx = Index(ser) + assert idx._values.flags.writeable diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 412a59d15307d..80c39322b9b81 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -500,3 +500,12 @@ def test_ndarray_compat_properties(index): # test for validity idx.nbytes idx.values.nbytes + + +def test_compare_read_only_array(): + # GH#57130 + arr = np.array([], dtype=object) + arr.flags.writeable = False + idx = pd.Index(arr) + result = idx > 69 + assert result.dtype == bool From 27cea3a8d338e2ebaa809e9079de19fcb3991cee Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 30 Jan 2024 18:46:41 +0100 Subject: [PATCH 20/60] Backport PR #57144 on branch 2.2.x (CI: Fix _get_dst_hours for numpy 2.0 change) (#57153) Backport PR #57144: CI: Fix _get_dst_hours for numpy 2.0 change Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/_libs/tslibs/tzconversion.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 2c4f0cd14db13..e3facd3d9599b 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -607,7 +607,8 @@ cdef ndarray[int64_t] _get_dst_hours( ndarray[uint8_t, cast=True] mismatch ndarray[int64_t] delta, dst_hours ndarray[intp_t] switch_idxs, trans_idx, grp, a_idx, b_idx, one_diff - list trans_grp + # TODO: Can uncomment when numpy >=2 is the minimum + # tuple trans_grp intp_t switch_idx int64_t left, right From f6fd475680761f12dbc4abe3db4a26e55b68fd90 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 31 Jan 2024 02:52:25 +0100 Subject: [PATCH 21/60] Backport PR #57157 on branch 2.2.x (BUG: Fix to_dict with datelike types and orient=list) (#57160) Backport PR #57157: BUG: Fix to_dict with datelike types and orient=list Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/methods/to_dict.py | 8 ++------ pandas/tests/frame/methods/test_to_dict.py | 14 ++++++++++++++ 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 1302648c3fc9a..19b7e3493f964 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -18,6 +18,7 @@ Fixed regressions - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) +- Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py index 7bd4851425c3b..accbd92a91ed6 100644 --- a/pandas/core/methods/to_dict.py +++ b/pandas/core/methods/to_dict.py @@ -171,13 +171,9 @@ def to_dict( return into_c( ( k, - list( - map( - maybe_box_native, v.to_numpy(na_value=box_na_values[i]).tolist() - ) - ) + list(map(maybe_box_native, v.to_numpy(na_value=box_na_values[i]))) if i in object_dtype_indices_as_set - else v.to_numpy().tolist(), + else list(map(maybe_box_native, v.to_numpy())), ) for i, (k, v) in enumerate(df.items()) ) diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index 61f0ad30b4519..570f85a4a31ee 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -12,8 +12,11 @@ NA, DataFrame, Index, + Interval, MultiIndex, + Period, Series, + Timedelta, Timestamp, ) import pandas._testing as tm @@ -519,3 +522,14 @@ def test_to_dict_pos_args_deprecation(self): ) with tm.assert_produces_warning(FutureWarning, match=msg): df.to_dict("records", {}) + + +@pytest.mark.parametrize( + "val", [Timestamp(2020, 1, 1), Timedelta(1), Period("2020"), Interval(1, 2)] +) +def test_to_dict_list_pd_scalars(val): + # GH 54824 + df = DataFrame({"a": [val]}) + result = df.to_dict(orient="list") + expected = {"a": [val]} + assert result == expected From 59e6c80a47dd5bf0799e622eae7b3a0c5864309f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 31 Jan 2024 20:06:35 +0100 Subject: [PATCH 22/60] Backport PR #57175 on branch 2.2.x (BUG: Interchange protocol implementation handles empty dataframes incorrectly) (#57179) Backport PR #57175: BUG: Interchange protocol implementation handles empty dataframes incorrectly Co-authored-by: Marco Edward Gorelli --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/interchange/buffer.py | 2 +- pandas/tests/interchange/test_impl.py | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 19b7e3493f964..4a20eda08937a 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -29,6 +29,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for empty inputs (:issue:`56700`) - Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/interchange/buffer.py b/pandas/core/interchange/buffer.py index a54e4428bd836..5c97fc17d7070 100644 --- a/pandas/core/interchange/buffer.py +++ b/pandas/core/interchange/buffer.py @@ -23,7 +23,7 @@ def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None: """ Handle only regular columns (= numpy arrays) for now. """ - if not x.strides == (x.dtype.itemsize,): + if x.strides[0] and not x.strides == (x.dtype.itemsize,): # The protocol does not support strided buffers, so a copy is # necessary. If that's not allowed, we need to raise an exception. if allow_copy: diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index c7b13f9fd7b2d..c6365233728d2 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -379,3 +379,12 @@ def test_large_string(): result = pd.api.interchange.from_dataframe(df.__dataframe__()) expected = pd.DataFrame({"a": ["x"]}, dtype="object") tm.assert_frame_equal(result, expected) + + +def test_empty_dataframe(): + # https://github.com/pandas-dev/pandas/issues/56700 + df = pd.DataFrame({"a": []}, dtype="int8") + dfi = df.__dataframe__() + result = pd.api.interchange.from_dataframe(dfi, allow_copy=False) + expected = pd.DataFrame({"a": []}, dtype="int8") + tm.assert_frame_equal(result, expected) From bc09d5765197c39daae8326fc9ee44bb2c3153c4 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 31 Jan 2024 20:20:42 +0100 Subject: [PATCH 23/60] Backport PR #57169 on branch 2.2.x (REGR: DataFrame.sort_index not producing stable sort) (#57180) Backport PR #57169: REGR: DataFrame.sort_index not producing stable sort Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/indexes/base.py | 11 ++++----- pandas/tests/frame/methods/test_sort_index.py | 24 +++++++++++++++++++ 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 4a20eda08937a..9002d9af2c602 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -18,6 +18,7 @@ Fixed regressions - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) +- Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5e7f2e27f1275..6f9078bf5a2a2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5914,17 +5914,14 @@ def sort_values( (Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2])) """ if key is None and ( - self.is_monotonic_increasing or self.is_monotonic_decreasing + (ascending and self.is_monotonic_increasing) + or (not ascending and self.is_monotonic_decreasing) ): - reverse = ascending != self.is_monotonic_increasing - sorted_index = self[::-1] if reverse else self.copy() if return_indexer: indexer = np.arange(len(self), dtype=np.intp) - if reverse: - indexer = indexer[::-1] - return sorted_index, indexer + return self.copy(), indexer else: - return sorted_index + return self.copy() # GH 35584. Sort missing values according to na_position kwarg # ignore na_position for MultiIndex diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 49e292057e4dc..830561a1349ee 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -1002,3 +1002,27 @@ def test_axis_columns_ignore_index(): result = df.sort_index(axis="columns", ignore_index=True) expected = DataFrame([[2, 1]]) tm.assert_frame_equal(result, expected) + + +def test_sort_index_stable_sort(): + # GH 57151 + df = DataFrame( + data=[ + (Timestamp("2024-01-30 13:00:00"), 13.0), + (Timestamp("2024-01-30 13:00:00"), 13.1), + (Timestamp("2024-01-30 12:00:00"), 12.0), + (Timestamp("2024-01-30 12:00:00"), 12.1), + ], + columns=["dt", "value"], + ).set_index(["dt"]) + result = df.sort_index(level="dt", kind="stable") + expected = DataFrame( + data=[ + (Timestamp("2024-01-30 12:00:00"), 12.0), + (Timestamp("2024-01-30 12:00:00"), 12.1), + (Timestamp("2024-01-30 13:00:00"), 13.0), + (Timestamp("2024-01-30 13:00:00"), 13.1), + ], + columns=["dt", "value"], + ).set_index(["dt"]) + tm.assert_frame_equal(result, expected) From 62aea0f06dc8bb7b029f551ef2bcfb8b08ee8ddf Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Thu, 1 Feb 2024 21:15:39 +0000 Subject: [PATCH 24/60] =?UTF-8?q?Backport=20PR=20#57173:=20BUG:=20pandas?= =?UTF-8?q?=20int=20extension=20dtypes=20has=20no=20attribute=E2=80=A6=20(?= =?UTF-8?q?#57198)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backport PR #57173: BUG: pandas int extension dtypes has no attribute byteorder --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/interchange/column.py | 3 +++ pandas/tests/interchange/test_impl.py | 12 ++++++++++++ 3 files changed, 16 insertions(+) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 9002d9af2c602..56e645d4c55db 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -30,6 +30,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for Nullable integers (:issue:`55069`) - Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for empty inputs (:issue:`56700`) - Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`) diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index ee1b5cd34a7f7..96757072dd854 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -11,6 +11,7 @@ from pandas.core.dtypes.dtypes import ( ArrowDtype, + BaseMaskedDtype, DatetimeTZDtype, ) @@ -143,6 +144,8 @@ def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]: byteorder = dtype.numpy_dtype.byteorder elif isinstance(dtype, DatetimeTZDtype): byteorder = dtype.base.byteorder # type: ignore[union-attr] + elif isinstance(dtype, BaseMaskedDtype): + byteorder = dtype.numpy_dtype.byteorder else: byteorder = dtype.byteorder diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index c6365233728d2..209944427d5dc 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -9,6 +9,7 @@ is_platform_windows, ) from pandas.compat.numpy import np_version_lt1p23 +import pandas.util._test_decorators as td import pandas as pd import pandas._testing as tm @@ -381,6 +382,17 @@ def test_large_string(): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "dtype", ["Int8", pytest.param("Int8[pyarrow]", marks=td.skip_if_no("pyarrow"))] +) +def test_nullable_integers(dtype: str) -> None: + # https://github.com/pandas-dev/pandas/issues/55069 + df = pd.DataFrame({"a": [1]}, dtype=dtype) + expected = pd.DataFrame({"a": [1]}, dtype="int8") + result = pd.api.interchange.from_dataframe(df.__dataframe__()) + tm.assert_frame_equal(result, expected) + + def test_empty_dataframe(): # https://github.com/pandas-dev/pandas/issues/56700 df = pd.DataFrame({"a": []}, dtype="int8") From be8f9f267473133f5436cef564bd13e2872f9bec Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 2 Feb 2024 03:37:48 +0100 Subject: [PATCH 25/60] Backport PR #57163 on branch 2.2.x (CI: Add macOS M1 CI) (#57202) Backport PR #57163: CI: Add macOS M1 CI Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 9 +++++---- .github/workflows/wheels.yml | 18 ++++++++++-------- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311.yaml | 3 +-- ci/deps/actions-312.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- pyproject.toml | 4 ---- 7 files changed, 19 insertions(+), 21 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index a3cffb4b03b93..4c7aa1e1e49ee 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -214,7 +214,8 @@ jobs: timeout-minutes: 90 strategy: matrix: - os: [macos-latest, windows-latest] + # Note: Don't use macOS latest since macos 14 appears to be arm64 only + os: [macos-13, macos-14, windows-latest] env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml, actions-312.yaml] fail-fast: false runs-on: ${{ matrix.os }} @@ -227,8 +228,7 @@ jobs: PANDAS_CI: 1 PYTEST_TARGET: pandas PATTERN: "not slow and not db and not network and not single_cpu" - # GH 47443: PYTEST_WORKERS > 0 crashes Windows builds with memory related errors - PYTEST_WORKERS: ${{ matrix.os == 'macos-latest' && 'auto' || '0' }} + PYTEST_WORKERS: 'auto' steps: - name: Checkout @@ -354,7 +354,8 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-22.04, macOS-latest, windows-latest] + # Separate out macOS 13 and 14, since macOS 14 is arm64 only + os: [ubuntu-22.04, macOS-13, macOS-14, windows-latest] timeout-minutes: 90 diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 6d3b9048a2122..f79b2c51b5f92 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -94,7 +94,9 @@ jobs: buildplat: - [ubuntu-22.04, manylinux_x86_64] - [ubuntu-22.04, musllinux_x86_64] - - [macos-12, macosx_*] + - [macos-12, macosx_x86_64] + # Note: M1 images on Github Actions start from macOS 14 + - [macos-14, macosx_arm64] - [windows-2022, win_amd64] # TODO: support PyPy? python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"]] @@ -128,7 +130,7 @@ jobs: # Python version used to build sdist doesn't matter # wheel will be built from sdist with the correct version - name: Unzip sdist (macOS) - if: ${{ matrix.buildplat[1] == 'macosx_*' }} + if: ${{ startsWith(matrix.buildplat[1], 'macosx') }} run: | tar -xzf ./dist/${{ env.sdist_name }} -C ./dist @@ -139,18 +141,18 @@ jobs: - name: Build normal wheels if: ${{ (env.IS_SCHEDULE_DISPATCH != 'true' || env.IS_PUSH == 'true') }} - uses: pypa/cibuildwheel@v2.16.4 + uses: pypa/cibuildwheel@v2.16.5 with: - package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} + package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: CIBW_PRERELEASE_PYTHONS: True CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} - name: Build nightly wheels (with NumPy pre-release) if: ${{ (env.IS_SCHEDULE_DISPATCH == 'true' && env.IS_PUSH != 'true') }} - uses: pypa/cibuildwheel@v2.16.4 + uses: pypa/cibuildwheel@v2.16.5 with: - package-dir: ./dist/${{ matrix.buildplat[1] == 'macosx_*' && env.sdist_name || needs.build_sdist.outputs.sdist_file }} + package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: # The nightly wheels should be build witht he NumPy 2.0 pre-releases # which requires the additional URL. @@ -183,7 +185,7 @@ jobs: $TST_CMD = @" python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0; python -m pip install `$(Get-Item pandas\wheelhouse\*.whl); - python -c `'import pandas as pd; pd.test(extra_args=[\"`\"--no-strict-data-files`\"\", \"`\"-m not clipboard and not single_cpu and not slow and not network and not db`\"\"])`'; + python -c `'import pandas as pd; pd.test(extra_args=[`\"--no-strict-data-files`\", `\"-m not clipboard and not single_cpu and not slow and not network and not db`\"])`'; "@ # add rc to the end of the image name if the Python version is unreleased docker pull python:${{ matrix.python[1] == '3.12' && '3.12-rc' || format('{0}-windowsservercore', matrix.python[1]) }} @@ -191,7 +193,7 @@ jobs: - uses: actions/upload-artifact@v4 with: - name: ${{ matrix.python[0] }}-${{ startsWith(matrix.buildplat[1], 'macosx') && 'macosx' || matrix.buildplat[1] }} + name: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} path: ./wheelhouse/*.whl - name: Upload wheels & sdist diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 45f114322015b..a3e44e6373145 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - pytest-qt>=4.2.0 - boto3 @@ -61,3 +60,4 @@ dependencies: - adbc-driver-postgresql>=0.8.0 - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 + - pytest-localserver>=0.7.1 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index d14686696e669..95cd1a4d46ef4 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - pytest-qt>=4.2.0 - boto3 @@ -60,4 +59,4 @@ dependencies: - pip: - adbc-driver-postgresql>=0.8.0 - adbc-driver-sqlite>=0.8.0 - - tzdata>=2022.7 + - pytest-localserver>=0.7.1 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml index 86aaf24b4e15c..a442ed6feeb5d 100644 --- a/ci/deps/actions-312.yaml +++ b/ci/deps/actions-312.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - pytest-qt>=4.2.0 - boto3 @@ -61,3 +60,4 @@ dependencies: - adbc-driver-postgresql>=0.8.0 - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 + - pytest-localserver>=0.7.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 31ee74174cd46..b162a78e7f115 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-localserver>=0.7.1 - pytest-qt>=4.2.0 - boto3 @@ -61,3 +60,4 @@ dependencies: - adbc-driver-postgresql>=0.8.0 - adbc-driver-sqlite>=0.8.0 - tzdata>=2022.7 + - pytest-localserver>=0.7.1 diff --git a/pyproject.toml b/pyproject.toml index 2f70ade7b3afe..8c9a79aa2b059 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -162,10 +162,6 @@ test-command = """ pd.test(extra_args=["-m not clipboard and single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \ """ -[tool.cibuildwheel.macos] -archs = "x86_64 arm64" -test-skip = "*_arm64" - [tool.cibuildwheel.windows] before-build = "pip install delvewheel" repair-wheel-command = "delvewheel repair -w {dest_dir} {wheel}" From 5034b780e90f499d4da8dc2e461aca19304c5263 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 2 Feb 2024 03:38:20 +0100 Subject: [PATCH 26/60] Backport PR #57174 on branch 2.2.x (BUG: Interchange protocol implementation allows non-string column names) (#57203) Backport PR #57174: BUG: Interchange protocol implementation allows non-string column names Co-authored-by: Marco Edward Gorelli --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/interchange/column.py | 8 ++++++++ pandas/core/interchange/dataframe.py | 2 +- pandas/tests/interchange/test_impl.py | 26 ++++++++++++++++++++++++-- 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 56e645d4c55db..13d5024b5a131 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -32,6 +32,7 @@ Bug fixes ~~~~~~~~~ - Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for Nullable integers (:issue:`55069`) - Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for empty inputs (:issue:`56700`) +- Fixed bug in :func:`pandas.api.interchange.from_dataframe` which wasn't converting columns names to strings (:issue:`55069`) - Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index 96757072dd854..e273ecad8b51e 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -77,6 +77,14 @@ def __init__(self, column: pd.Series, allow_copy: bool = True) -> None: Note: doesn't deal with extension arrays yet, just assume a regular Series/ndarray for now. """ + if isinstance(column, pd.DataFrame): + raise TypeError( + "Expected a Series, got a DataFrame. This likely happened " + "because you called __dataframe__ on a DataFrame which, " + "after converting column names to string, resulted in duplicated " + f"names: {column.columns}. Please rename these columns before " + "using the interchange protocol." + ) if not isinstance(column, pd.Series): raise NotImplementedError(f"Columns of type {type(column)} not handled yet") diff --git a/pandas/core/interchange/dataframe.py b/pandas/core/interchange/dataframe.py index 4f08b2c2b3a7b..1ffe0e8e8dbb0 100644 --- a/pandas/core/interchange/dataframe.py +++ b/pandas/core/interchange/dataframe.py @@ -32,7 +32,7 @@ def __init__(self, df: DataFrame, allow_copy: bool = True) -> None: Constructor - an instance of this (private) class is returned from `pd.DataFrame.__dataframe__`. """ - self._df = df + self._df = df.rename(columns=str, copy=False) self._allow_copy = allow_copy def __dataframe__( diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 209944427d5dc..d47b533f92235 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -180,8 +180,6 @@ def test_missing_from_masked(): } ) - df2 = df.__dataframe__() - rng = np.random.default_rng(2) dict_null = {col: rng.integers(low=0, high=len(df)) for col in df.columns} for col, num_nulls in dict_null.items(): @@ -382,6 +380,30 @@ def test_large_string(): tm.assert_frame_equal(result, expected) +def test_non_str_names(): + # https://github.com/pandas-dev/pandas/issues/56701 + df = pd.Series([1, 2, 3], name=0).to_frame() + names = df.__dataframe__().column_names() + assert names == ["0"] + + +def test_non_str_names_w_duplicates(): + # https://github.com/pandas-dev/pandas/issues/56701 + df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]}) + dfi = df.__dataframe__() + with pytest.raises( + TypeError, + match=( + "Expected a Series, got a DataFrame. This likely happened because you " + "called __dataframe__ on a DataFrame which, after converting column " + r"names to string, resulted in duplicated names: Index\(\['0', '0'\], " + r"dtype='object'\). Please rename these columns before using the " + "interchange protocol." + ), + ): + pd.api.interchange.from_dataframe(dfi, allow_copy=False) + + @pytest.mark.parametrize( "dtype", ["Int8", pytest.param("Int8[pyarrow]", marks=td.skip_if_no("pyarrow"))] ) From e54e0e2a63af60e88fad5d5a592bdf957521f3a9 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 4 Feb 2024 04:08:14 +0100 Subject: [PATCH 27/60] Backport PR #57232 on branch 2.2.x (REGR: to_json converting nullable ints to floats) (#57240) Backport PR #57232: REGR: to_json converting nullable ints to floats Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/arrays/arrow/array.py | 5 +++++ pandas/core/arrays/masked.py | 3 +++ pandas/tests/io/json/test_pandas.py | 16 ++++++++++++++++ 4 files changed, 25 insertions(+) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 13d5024b5a131..3cc11974b14e5 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -20,6 +20,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) +- Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a5ce46ed612f3..621a32a049f3b 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1348,6 +1348,11 @@ def _to_timedeltaarray(self) -> TimedeltaArray: np_array = np_array.astype(np_dtype) return TimedeltaArray._simple_new(np_array, dtype=np_dtype) + def _values_for_json(self) -> np.ndarray: + if is_numeric_dtype(self.dtype): + return np.asarray(self, dtype=object) + return super()._values_for_json() + @doc(ExtensionArray.to_numpy) def to_numpy( self, diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 234d96e53a67c..8c41de9c9fffa 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -430,6 +430,9 @@ def __abs__(self) -> Self: # ------------------------------------------------------------------ + def _values_for_json(self) -> np.ndarray: + return np.asarray(self, dtype=object) + def to_numpy( self, dtype: npt.DTypeLike | None = None, diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 1da27ad173235..caa25841d3596 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2172,3 +2172,19 @@ def test_json_pos_args_deprecation(): with tm.assert_produces_warning(FutureWarning, match=msg): buf = BytesIO() df.to_json(buf, "split") + + +@td.skip_if_no("pyarrow") +def test_to_json_ea_null(): + # GH#57224 + df = DataFrame( + { + "a": Series([1, NA], dtype="int64[pyarrow]"), + "b": Series([2, NA], dtype="Int64"), + } + ) + result = df.to_json(orient="records", lines=True) + expected = """{"a":1,"b":2} +{"a":null,"b":null} +""" + assert result == expected From c1b17ae8dcebb7faaa90ea520a7e407321daa594 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 6 Feb 2024 03:28:16 +0100 Subject: [PATCH 28/60] Backport PR #57265 on branch 2.2.x (COMPAT: Numpy 2.0 casting compat) (#57271) Backport PR #57265: COMPAT: Numpy 2.0 casting compat Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/dtypes/cast.py | 1 + pandas/core/internals/blocks.py | 9 ++++++++- pandas/tests/indexing/test_loc.py | 16 ++-------------- pandas/tests/series/test_constructors.py | 12 +++++++++--- 4 files changed, 20 insertions(+), 18 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 259e83a5936d7..690af6b0ebdc7 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1682,6 +1682,7 @@ def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.n arr = np.asarray(arr) if np.issubdtype(arr.dtype, str): + # TODO(numpy-2.0 min): This case will raise an OverflowError above if (casted.astype(str) == arr).all(): return casted raise ValueError(f"string values cannot be losslessly cast to {dtype}") diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 70a27300bd60f..259e969112dd7 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1425,7 +1425,14 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block: if isinstance(casted, np.ndarray) and casted.ndim == 1 and len(casted) == 1: # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 casted = casted[0, ...] - values[indexer] = casted + try: + values[indexer] = casted + except (TypeError, ValueError) as err: + if is_list_like(casted): + raise ValueError( + "setting an array element with a sequence." + ) from err + raise return self def putmask( diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 952251a58e981..0cd1390d41461 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1235,13 +1235,7 @@ def test_loc_setitem_empty_append_raises(self): with pytest.raises(KeyError, match=msg): df.loc[[0, 1], "x"] = data - msg = "|".join( - [ - "cannot copy sequence with size 2 to array axis with dimension 0", - r"could not broadcast input array from shape \(2,\) into shape \(0,\)", - "Must have equal len keys and value when setting with an iterable", - ] - ) + msg = "setting an array element with a sequence." with pytest.raises(ValueError, match=msg): df.loc[0:2, "x"] = data @@ -1575,16 +1569,10 @@ def test_loc_setitem_2d_to_1d_raises(self): # float64 dtype to avoid upcast when trying to set float data ser = Series(range(2), dtype="float64") - msg = "|".join( - [ - r"shape mismatch: value array of shape \(2,2\)", - r"cannot reshape array of size 4 into shape \(2,\)", - ] - ) + msg = "setting an array element with a sequence." with pytest.raises(ValueError, match=msg): ser.loc[range(2)] = data - msg = r"could not broadcast input array from shape \(2,2\) into shape \(2,?\)" with pytest.raises(ValueError, match=msg): ser.loc[:] = data diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index da069afe5e709..4d3839553a0af 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1958,9 +1958,15 @@ def test_constructor_int64_dtype(self, any_int_dtype): def test_constructor_raise_on_lossy_conversion_of_strings(self): # GH#44923 - with pytest.raises( - ValueError, match="string values cannot be losslessly cast to int8" - ): + if not np_version_gt2: + raises = pytest.raises( + ValueError, match="string values cannot be losslessly cast to int8" + ) + else: + raises = pytest.raises( + OverflowError, match="The elements provided in the data" + ) + with raises: Series(["128"], dtype="int8") def test_constructor_dtype_timedelta_alternative_construct(self): From 45fc954839e79605b14af4b771da56b12b7283cb Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Wed, 7 Feb 2024 15:12:00 +0000 Subject: [PATCH 29/60] Backport PR #56945 on branch 2.2.x (ENH: raise ValueError if invalid period freq pass to asfreq when the index of df is a PeriodIndex) (#57292) 'Backport PR #56945: ENH: raise ValueError if invalid period freq pass to asfreq when the index of df is a PeriodIndex' (cherry picked from commit cb97ce6e496e7d1df76f06550c0fe8c4590ff3ce) Co-authored-by: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/_libs/tslibs/offsets.pyx | 32 +++++++----- pandas/core/arrays/period.py | 11 ++-- pandas/plotting/_matplotlib/timeseries.py | 5 +- pandas/tests/dtypes/test_dtypes.py | 10 ++-- .../datetimes/methods/test_to_period.py | 2 +- .../indexes/period/methods/test_asfreq.py | 51 +++++++++++++++++++ pandas/tests/resample/test_period_index.py | 4 +- pandas/tests/scalar/period/test_asfreq.py | 5 +- pandas/tests/scalar/period/test_period.py | 16 +++--- 10 files changed, 96 insertions(+), 41 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 3cc11974b14e5..3f70c72a55a4c 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -35,6 +35,7 @@ Bug fixes - Fixed bug in :func:`pandas.api.interchange.from_dataframe` which was raising for empty inputs (:issue:`56700`) - Fixed bug in :func:`pandas.api.interchange.from_dataframe` which wasn't converting columns names to strings (:issue:`55069`) - Fixed bug in :meth:`DataFrame.__getitem__` for empty :class:`DataFrame` with Copy-on-Write enabled (:issue:`57130`) +- Fixed bug in :meth:`PeriodIndex.asfreq` which was silently converting frequencies which are not supported as period frequencies instead of raising an error (:issue:`56945`) .. --------------------------------------------------------------------------- .. _whatsnew_221.other: diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 764a044f32c82..70e1ca1c4012a 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -4271,9 +4271,7 @@ cdef class CustomBusinessDay(BusinessDay): @property def _period_dtype_code(self): # GH#52534 - raise TypeError( - "CustomBusinessDay is not supported as period frequency" - ) + raise ValueError(f"{self.base} is not supported as period frequency") _apply_array = BaseOffset._apply_array @@ -4822,19 +4820,19 @@ cpdef to_offset(freq, bint is_period=False): if freq is None: return None - if isinstance(freq, BaseOffset): - return freq - if isinstance(freq, tuple): raise TypeError( f"to_offset does not support tuples {freq}, pass as a string instead" ) + if isinstance(freq, BaseOffset): + result = freq + elif PyDelta_Check(freq): - return delta_to_tick(freq) + result = delta_to_tick(freq) elif isinstance(freq, str): - delta = None + result = None stride_sign = None try: @@ -4935,21 +4933,27 @@ cpdef to_offset(freq, bint is_period=False): offset = _get_offset(prefix) offset = offset * int(np.fabs(stride) * stride_sign) - if delta is None: - delta = offset + if result is None: + result = offset else: - delta = delta + offset + result = result + offset except (ValueError, TypeError) as err: raise ValueError(INVALID_FREQ_ERR_MSG.format( f"{freq}, failed to parse with error message: {repr(err)}") ) else: - delta = None + result = None - if delta is None: + if result is None: raise ValueError(INVALID_FREQ_ERR_MSG.format(freq)) - return delta + if is_period and not hasattr(result, "_period_dtype_code"): + if isinstance(freq, str): + raise ValueError(f"{result.name} is not supported as period frequency") + else: + raise ValueError(f"{freq} is not supported as period frequency") + + return result # ---------------------------------------------------------------------- diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 28f25d38b2363..d635eb4a25df3 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -733,8 +733,8 @@ def asfreq(self, freq=None, how: str = "E") -> Self: '2015-01'], dtype='period[M]') """ how = libperiod.validate_end_alias(how) - if isinstance(freq, BaseOffset): - freq = freq_to_period_freqstr(freq.n, freq.name) + if isinstance(freq, BaseOffset) and hasattr(freq, "_period_dtype_code"): + freq = PeriodDtype(freq)._freqstr freq = Period._maybe_convert_freq(freq) base1 = self._dtype._dtype_code @@ -1186,12 +1186,7 @@ def dt64arr_to_periodarr( reso = get_unit_from_dtype(data.dtype) freq = Period._maybe_convert_freq(freq) - try: - base = freq._period_dtype_code - except (AttributeError, TypeError): - # AttributeError: _period_dtype_code might not exist - # TypeError: _period_dtype_code might intentionally raise - raise TypeError(f"{freq.name} is not supported as period frequency") + base = freq._period_dtype_code return c_dt64arr_to_periodarr(data.view("i8"), base, tz, reso=reso), freq diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index bf1c0f6346f02..c7ddfa55d0417 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -205,7 +205,10 @@ def _get_ax_freq(ax: Axes): def _get_period_alias(freq: timedelta | BaseOffset | str) -> str | None: - freqstr = to_offset(freq, is_period=True).rule_code + if isinstance(freq, BaseOffset): + freqstr = freq.name + else: + freqstr = to_offset(freq, is_period=True).rule_code return get_period_alias(freqstr) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 0dad0b05303ad..de1ddce724a5b 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -445,12 +445,12 @@ def test_construction(self): def test_cannot_use_custom_businessday(self): # GH#52534 - msg = "CustomBusinessDay is not supported as period frequency" + msg = "C is not supported as period frequency" + msg1 = " is not supported as period frequency" msg2 = r"PeriodDtype\[B\] is deprecated" - with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, match=msg2): - PeriodDtype("C") - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): + PeriodDtype("C") + with pytest.raises(ValueError, match=msg1): with tm.assert_produces_warning(FutureWarning, match=msg2): PeriodDtype(pd.offsets.CustomBusinessDay()) diff --git a/pandas/tests/indexes/datetimes/methods/test_to_period.py b/pandas/tests/indexes/datetimes/methods/test_to_period.py index 00c0216a9b3b5..de8d32f64cde2 100644 --- a/pandas/tests/indexes/datetimes/methods/test_to_period.py +++ b/pandas/tests/indexes/datetimes/methods/test_to_period.py @@ -221,5 +221,5 @@ def test_to_period_offsets_not_supported(self, freq): # GH#56243 msg = f"{freq[1:]} is not supported as period frequency" ts = date_range("1/1/2012", periods=4, freq=freq) - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): ts.to_period() diff --git a/pandas/tests/indexes/period/methods/test_asfreq.py b/pandas/tests/indexes/period/methods/test_asfreq.py index ed078a3e8fb8b..865bae69d91c7 100644 --- a/pandas/tests/indexes/period/methods/test_asfreq.py +++ b/pandas/tests/indexes/period/methods/test_asfreq.py @@ -1,3 +1,5 @@ +import re + import pytest from pandas import ( @@ -7,6 +9,8 @@ ) import pandas._testing as tm +from pandas.tseries import offsets + class TestPeriodIndex: def test_asfreq(self): @@ -136,3 +140,50 @@ def test_asfreq_with_different_n(self): excepted = Series([1, 2], index=PeriodIndex(["2020-02", "2020-04"], freq="M")) tm.assert_series_equal(result, excepted) + + @pytest.mark.parametrize( + "freq", + [ + "2BMS", + "2YS-MAR", + "2bh", + ], + ) + def test_pi_asfreq_not_supported_frequency(self, freq): + # GH#55785 + msg = f"{freq[1:]} is not supported as period frequency" + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) + + @pytest.mark.parametrize( + "freq", + [ + "2BME", + "2YE-MAR", + "2QE", + ], + ) + def test_pi_asfreq_invalid_frequency(self, freq): + # GH#55785 + msg = f"Invalid frequency: {freq}" + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) + + @pytest.mark.parametrize( + "freq", + [ + offsets.MonthBegin(2), + offsets.BusinessMonthEnd(2), + ], + ) + def test_pi_asfreq_invalid_baseoffset(self, freq): + # GH#56945 + msg = re.escape(f"{freq} is not supported as period frequency") + + pi = PeriodIndex(["2020-01-01", "2021-01-01"], freq="M") + with pytest.raises(ValueError, match=msg): + pi.asfreq(freq=freq) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 451d2a83c1d5e..6b7cce7d15a5b 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -1040,8 +1040,8 @@ def test_resample_lowercase_frequency_deprecated( offsets.BusinessHour(2), ], ) - def test_asfreq_invalid_period_freq(self, offset, series_and_frame): - # GH#9586 + def test_asfreq_invalid_period_offset(self, offset, series_and_frame): + # GH#55785 msg = f"Invalid offset: '{offset.base}' for converting time series " df = series_and_frame diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index 4489c307172d7..73c4d8061c257 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -820,10 +820,9 @@ def test_asfreq_MS(self): assert initial.asfreq(freq="M", how="S") == Period("2013-01", "M") - msg = INVALID_FREQ_ERR_MSG + msg = "MS is not supported as period frequency" with pytest.raises(ValueError, match=msg): initial.asfreq(freq="MS", how="S") - msg = "MonthBegin is not supported as period frequency" - with pytest.raises(TypeError, match=msg): + with pytest.raises(ValueError, match=msg): Period("2013-01", "MS") diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index d819e903a0bae..2c3a0816737fc 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -3,6 +3,7 @@ datetime, timedelta, ) +import re import numpy as np import pytest @@ -40,21 +41,22 @@ class TestPeriodDisallowedFreqs: ) def test_offsets_not_supported(self, freq, freq_msg): # GH#55785 - msg = f"{freq_msg} is not supported as period frequency" - with pytest.raises(TypeError, match=msg): + msg = re.escape(f"{freq} is not supported as period frequency") + with pytest.raises(ValueError, match=msg): Period(year=2014, freq=freq) def test_custom_business_day_freq_raises(self): # GH#52534 - msg = "CustomBusinessDay is not supported as period frequency" - with pytest.raises(TypeError, match=msg): + msg = "C is not supported as period frequency" + with pytest.raises(ValueError, match=msg): Period("2023-04-10", freq="C") - with pytest.raises(TypeError, match=msg): + msg = f"{offsets.CustomBusinessDay().base} is not supported as period frequency" + with pytest.raises(ValueError, match=msg): Period("2023-04-10", freq=offsets.CustomBusinessDay()) def test_invalid_frequency_error_message(self): - msg = "WeekOfMonth is not supported as period frequency" - with pytest.raises(TypeError, match=msg): + msg = "WOM-1MON is not supported as period frequency" + with pytest.raises(ValueError, match=msg): Period("2012-01-02", freq="WOM-1MON") def test_invalid_frequency_period_error_message(self): From 11a6136ce2efc25052849ff760b260ca49e371f0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 7 Feb 2024 19:02:06 +0100 Subject: [PATCH 30/60] Backport PR #57233 on branch 2.2.x (REGR: Fix to_numpy conversion for arrow ea with float dtype given) (#57294) Backport PR #57233: REGR: Fix to_numpy conversion for arrow ea with float dtype given Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/arrays/_utils.py | 2 ++ pandas/tests/arrays/boolean/test_construction.py | 2 -- pandas/tests/arrays/floating/test_to_numpy.py | 8 ++++---- pandas/tests/arrays/integer/test_dtypes.py | 2 +- pandas/tests/series/methods/test_to_numpy.py | 11 +++++++++++ 6 files changed, 19 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 3f70c72a55a4c..883627bd4b19b 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -25,6 +25,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) +- Fixed regression in :meth:`Series.to_numpy` when dtype is given as float and the data contains NaNs (:issue:`57121`) .. --------------------------------------------------------------------------- .. _whatsnew_221.bug_fixes: diff --git a/pandas/core/arrays/_utils.py b/pandas/core/arrays/_utils.py index c75ec7f843ed2..88091a88a4e12 100644 --- a/pandas/core/arrays/_utils.py +++ b/pandas/core/arrays/_utils.py @@ -39,6 +39,8 @@ def to_numpy_dtype_inference( dtype = arr.dtype.numpy_dtype # type: ignore[union-attr] elif dtype is not None: dtype = np.dtype(dtype) + if na_value is lib.no_default and hasna and dtype.kind == "f": + na_value = np.nan dtype_given = True else: dtype_given = True diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py index a5a2dd33940b8..645e763fbf00c 100644 --- a/pandas/tests/arrays/boolean/test_construction.py +++ b/pandas/tests/arrays/boolean/test_construction.py @@ -308,8 +308,6 @@ def test_to_numpy(box): # converting to int or float without specifying na_value raises with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"): arr.to_numpy(dtype="int64") - with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): - arr.to_numpy(dtype="float64") def test_to_numpy_copy(): diff --git a/pandas/tests/arrays/floating/test_to_numpy.py b/pandas/tests/arrays/floating/test_to_numpy.py index a25ac40cb3e7c..e954cecba417a 100644 --- a/pandas/tests/arrays/floating/test_to_numpy.py +++ b/pandas/tests/arrays/floating/test_to_numpy.py @@ -33,10 +33,10 @@ def test_to_numpy_float(box): tm.assert_numpy_array_equal(result, expected) arr = con([0.1, 0.2, None], dtype="Float64") - with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): - result = arr.to_numpy(dtype="float64") + result = arr.to_numpy(dtype="float64") + expected = np.array([0.1, 0.2, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) - # need to explicitly specify na_value result = arr.to_numpy(dtype="float64", na_value=np.nan) expected = np.array([0.1, 0.2, np.nan], dtype="float64") tm.assert_numpy_array_equal(result, expected) @@ -100,7 +100,7 @@ def test_to_numpy_dtype(box, dtype): tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("dtype", ["float64", "float32", "int32", "int64", "bool"]) +@pytest.mark.parametrize("dtype", ["int32", "int64", "bool"]) @pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) def test_to_numpy_na_raises(box, dtype): con = pd.Series if box else pd.array diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index e3848cdfe3aa9..8620763988e06 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -271,7 +271,7 @@ def test_to_numpy_dtype(dtype, in_series): tm.assert_numpy_array_equal(result, expected) -@pytest.mark.parametrize("dtype", ["float64", "int64", "bool"]) +@pytest.mark.parametrize("dtype", ["int64", "bool"]) def test_to_numpy_na_raises(dtype): a = pd.array([0, 1, None], dtype="Int64") with pytest.raises(ValueError, match=dtype): diff --git a/pandas/tests/series/methods/test_to_numpy.py b/pandas/tests/series/methods/test_to_numpy.py index 5fe3e19b0a20b..8dcc1dd551315 100644 --- a/pandas/tests/series/methods/test_to_numpy.py +++ b/pandas/tests/series/methods/test_to_numpy.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( NA, Series, @@ -23,3 +25,12 @@ def test_to_numpy_cast_before_setting_na(): result = ser.to_numpy(dtype=np.float64, na_value=np.nan) expected = np.array([1.0]) tm.assert_numpy_array_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_to_numpy_arrow_dtype_given(): + # GH#57121 + ser = Series([1, NA], dtype="int64[pyarrow]") + result = ser.to_numpy(dtype="float64") + expected = np.array([1.0, np.nan]) + tm.assert_numpy_array_equal(result, expected) From bbc655d782c2c62ded6b4e305234cbce23ac9f22 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 10 Feb 2024 00:29:56 +0100 Subject: [PATCH 31/60] Backport PR #57121 on branch 2.2.x (REGR: Fix to_numpy for masked array with non-numeric dtype) (#57325) Backport PR #57121: REGR: Fix to_numpy for masked array with non-numeric dtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 3 +++ pandas/core/arrays/masked.py | 2 ++ pandas/tests/arrays/masked/test_function.py | 17 +++++++++++++++++ 3 files changed, 22 insertions(+) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 883627bd4b19b..009d4794dbd1d 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -17,12 +17,15 @@ Fixed regressions - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) +- Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) +- Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) +- Fixed regression in :meth:`ExtensionArray.to_numpy` raising for non-numeric masked dtypes (:issue:`56991`) - Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) - Fixed regression in :meth:`Series.to_numpy` when dtype is given as float and the data contains NaNs (:issue:`57121`) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 8c41de9c9fffa..7c49ce5343158 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -501,6 +501,8 @@ def to_numpy( """ hasna = self._hasna dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, hasna) + if dtype is None: + dtype = object if hasna: if ( diff --git a/pandas/tests/arrays/masked/test_function.py b/pandas/tests/arrays/masked/test_function.py index 4c7bd6e293ef4..b259018cd6121 100644 --- a/pandas/tests/arrays/masked/test_function.py +++ b/pandas/tests/arrays/masked/test_function.py @@ -5,6 +5,7 @@ import pandas as pd import pandas._testing as tm +from pandas.core.arrays import BaseMaskedArray arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES] arrays += [ @@ -55,3 +56,19 @@ def test_tolist(data): result = data.tolist() expected = list(data) tm.assert_equal(result, expected) + + +def test_to_numpy(): + # GH#56991 + + class MyStringArray(BaseMaskedArray): + dtype = pd.StringDtype() + _dtype_cls = pd.StringDtype + _internal_fill_value = pd.NA + + arr = MyStringArray( + values=np.array(["a", "b", "c"]), mask=np.array([False, True, False]) + ) + result = arr.to_numpy() + expected = np.array(["a", pd.NA, "c"]) + tm.assert_numpy_array_equal(result, expected) From 361b0899dfd66cdc8eebcfb0ef81285cf126d8a1 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 10 Feb 2024 00:30:08 +0100 Subject: [PATCH 32/60] Backport PR #57250 on branch 2.2.x (REGR/DOC: pd.concat should special case DatetimeIndex to sort even when sort=False) (#57326) Backport PR #57250: REGR/DOC: pd.concat should special case DatetimeIndex to sort even when sort=False Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/indexes/api.py | 1 + pandas/core/reshape/concat.py | 6 ++++-- pandas/tests/reshape/concat/test_datetimes.py | 12 ++++++------ 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 009d4794dbd1d..69e14a9028dd3 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -15,6 +15,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed memory leak in :func:`read_csv` (:issue:`57039`) - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) +- Fixed regression in :func:`concat` changing long-standing behavior that always sorted the non-concatenation axis when the axis was a :class:`DatetimeIndex` (:issue:`57006`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 560285bd57a22..15292953e72d0 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -295,6 +295,7 @@ def _find_common_index_dtype(inds): raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") if len(dtis) == len(indexes): + sort = True result = indexes[0] elif len(dtis) > 1: diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index aacea92611697..dc18bb65b35bc 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -205,8 +205,10 @@ def concat( Check whether the new concatenated axis contains duplicates. This can be very expensive relative to the actual data concatenation. sort : bool, default False - Sort non-concatenation axis if it is not already aligned. - + Sort non-concatenation axis if it is not already aligned. One exception to + this is when the non-concatentation axis is a DatetimeIndex and join='outer' + and the axis is not already aligned. In that case, the non-concatenation + axis is always sorted lexicographically. copy : bool, default True If False, do not copy data unnecessarily. diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 71ddff7438254..4c94dc0d51f7e 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -73,23 +73,23 @@ def test_concat_datetime_timezone(self): exp_idx = DatetimeIndex( [ - "2010-12-31 23:00:00+00:00", - "2011-01-01 00:00:00+00:00", - "2011-01-01 01:00:00+00:00", "2010-12-31 15:00:00+00:00", "2010-12-31 16:00:00+00:00", "2010-12-31 17:00:00+00:00", + "2010-12-31 23:00:00+00:00", + "2011-01-01 00:00:00+00:00", + "2011-01-01 01:00:00+00:00", ] ).as_unit("ns") expected = DataFrame( [ - [1, np.nan], - [2, np.nan], - [3, np.nan], [np.nan, 1], [np.nan, 2], [np.nan, 3], + [1, np.nan], + [2, np.nan], + [3, np.nan], ], index=exp_idx, columns=["a", "b"], From 044342727b2b7efacefa2c2dc485193897c47b0c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 10 Feb 2024 12:25:24 +0100 Subject: [PATCH 33/60] Backport PR #57322 on branch 2.2.x (REGR: Fix astype conversion of ea int to td/dt with missing values) (#57331) Backport PR #57322: REGR: Fix astype conversion of ea int to td/dt with missing values Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/arrays/_utils.py | 13 ++++++++++--- pandas/tests/extension/test_arrow.py | 9 +++++++++ pandas/tests/series/methods/test_to_numpy.py | 13 +++++++++++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/_utils.py b/pandas/core/arrays/_utils.py index 88091a88a4e12..6b46396d5efdf 100644 --- a/pandas/core/arrays/_utils.py +++ b/pandas/core/arrays/_utils.py @@ -39,14 +39,21 @@ def to_numpy_dtype_inference( dtype = arr.dtype.numpy_dtype # type: ignore[union-attr] elif dtype is not None: dtype = np.dtype(dtype) - if na_value is lib.no_default and hasna and dtype.kind == "f": - na_value = np.nan dtype_given = True else: dtype_given = True if na_value is lib.no_default: - na_value = arr.dtype.na_value + if dtype is None or not hasna: + na_value = arr.dtype.na_value + elif dtype.kind == "f": # type: ignore[union-attr] + na_value = np.nan + elif dtype.kind == "M": # type: ignore[union-attr] + na_value = np.datetime64("nat") + elif dtype.kind == "m": # type: ignore[union-attr] + na_value = np.timedelta64("nat") + else: + na_value = arr.dtype.na_value if not dtype_given and hasna: try: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 05a112e464677..e041093bbf5bc 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3305,6 +3305,15 @@ def test_arrow_floordiv_floating_0_divisor(dtype): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("dtype", ["float64", "datetime64[ns]", "timedelta64[ns]"]) +def test_astype_int_with_null_to_numpy_dtype(dtype): + # GH 57093 + ser = pd.Series([1, None], dtype="int64[pyarrow]") + result = ser.astype(dtype) + expected = pd.Series([1, None], dtype=dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES) def test_arrow_integral_floordiv_large_values(pa_type): # GH 56676 diff --git a/pandas/tests/series/methods/test_to_numpy.py b/pandas/tests/series/methods/test_to_numpy.py index 8dcc1dd551315..4bc7631090761 100644 --- a/pandas/tests/series/methods/test_to_numpy.py +++ b/pandas/tests/series/methods/test_to_numpy.py @@ -6,6 +6,7 @@ from pandas import ( NA, Series, + Timedelta, ) import pandas._testing as tm @@ -34,3 +35,15 @@ def test_to_numpy_arrow_dtype_given(): result = ser.to_numpy(dtype="float64") expected = np.array([1.0, np.nan]) tm.assert_numpy_array_equal(result, expected) + + +def test_astype_ea_int_to_td_ts(): + # GH#57093 + ser = Series([1, None], dtype="Int64") + result = ser.astype("m8[ns]") + expected = Series([1, Timedelta("nat")], dtype="m8[ns]") + tm.assert_series_equal(result, expected) + + result = ser.astype("M8[ns]") + expected = Series([1, Timedelta("nat")], dtype="M8[ns]") + tm.assert_series_equal(result, expected) From 28a7ec7d34be480b9a14e88236500fd74a2e0109 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 10 Feb 2024 16:06:36 +0100 Subject: [PATCH 34/60] Backport PR #57329 on branch 2.2.x (REGR: CategoricalIndex.difference with null values) (#57336) Backport PR #57329: REGR: CategoricalIndex.difference with null values Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/indexes/base.py | 7 +++++-- .../tests/indexes/categorical/test_setops.py | 18 ++++++++++++++++++ 3 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 pandas/tests/indexes/categorical/test_setops.py diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 69e14a9028dd3..335dada439029 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -20,6 +20,7 @@ Fixed regressions - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) +- Fixed regression in :meth:`CategoricalIndex.difference` raising ``KeyError`` when other contains null values other than NaN (:issue:`57318`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6f9078bf5a2a2..4b3d1a9e006dc 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3663,9 +3663,12 @@ def difference(self, other, sort=None): def _difference(self, other, sort): # overridden by RangeIndex + this = self + if isinstance(self, ABCCategoricalIndex) and self.hasnans and other.hasnans: + this = this.dropna() other = other.unique() - the_diff = self[other.get_indexer_for(self) == -1] - the_diff = the_diff if self.is_unique else the_diff.unique() + the_diff = this[other.get_indexer_for(this) == -1] + the_diff = the_diff if this.is_unique else the_diff.unique() the_diff = _maybe_try_sort(the_diff, sort) return the_diff diff --git a/pandas/tests/indexes/categorical/test_setops.py b/pandas/tests/indexes/categorical/test_setops.py new file mode 100644 index 0000000000000..2e87b90efd54c --- /dev/null +++ b/pandas/tests/indexes/categorical/test_setops.py @@ -0,0 +1,18 @@ +import numpy as np +import pytest + +from pandas import ( + CategoricalIndex, + Index, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("na_value", [None, np.nan]) +def test_difference_with_na(na_value): + # GH 57318 + ci = CategoricalIndex(["a", "b", "c", None]) + other = Index(["c", na_value]) + result = ci.difference(other) + expected = CategoricalIndex(["a", "b"], categories=["a", "b", "c"]) + tm.assert_index_equal(result, expected) From 10b26fefce26e1b2d29d2f7ca39be3b28d875def Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 10 Feb 2024 16:09:36 -0500 Subject: [PATCH 35/60] Backport PR #57333 on branch 2.2.x (REGR: merge with 3rd party EA's can sometimes raise ValueError) (#57337) Backport PR #57333: REGR: merge with 3rd party EA's can sometimes raise ValueError --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/_libs/index.pyx | 2 +- pandas/tests/indexes/test_base.py | 10 ++++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 335dada439029..b342eef850459 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -22,6 +22,7 @@ Fixed regressions - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`CategoricalIndex.difference` raising ``KeyError`` when other contains null values other than NaN (:issue:`57318`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) +- Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index e4dfe9dec3623..ee6a11ddab004 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -295,7 +295,7 @@ cdef class IndexEngine: values = self.values self.monotonic_inc, self.monotonic_dec, is_strict_monotonic = \ self._call_monotonic(values) - except TypeError: + except (TypeError, ValueError): self.monotonic_inc = 0 self.monotonic_dec = 0 is_strict_monotonic = 0 diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 666d92064c86c..1fa48f98942c2 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1722,3 +1722,13 @@ def test_nan_comparison_same_object(op): result = op(idx, idx.copy()) tm.assert_numpy_array_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_is_monotonic_pyarrow_list_type(): + # GH 57333 + import pyarrow as pa + + idx = Index([[1], [2, 3]], dtype=pd.ArrowDtype(pa.list_(pa.int64()))) + assert not idx.is_monotonic_increasing + assert not idx.is_monotonic_decreasing From 947f5aeda9d924ce7d93a46457574123bb7fb07a Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 10 Feb 2024 22:58:09 +0100 Subject: [PATCH 36/60] Backport PR #57323 on branch 2.2.x (REGR: Fix regression when grouping over a Series) (#57339) Backport PR #57323: REGR: Fix regression when grouping over a Series Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/internals/managers.py | 5 ++--- pandas/tests/copy_view/test_methods.py | 11 +++++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index b342eef850459..eb6e50c1be160 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -21,6 +21,7 @@ Fixed regressions - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`CategoricalIndex.difference` raising ``KeyError`` when other contains null values other than NaN (:issue:`57318`) +- Fixed regression in :meth:`DataFrame.groupby` raising ``ValueError`` when grouping by a :class:`Series` in some cases (:issue:`57276`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) - Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 3719bf1f77f85..220bb1133dfd5 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -12,7 +12,6 @@ cast, ) import warnings -import weakref import numpy as np @@ -282,8 +281,8 @@ def references_same_values(self, mgr: BaseBlockManager, blkno: int) -> bool: Checks if two blocks from two different block managers reference the same underlying values. """ - ref = weakref.ref(self.blocks[blkno]) - return ref in mgr.blocks[blkno].refs.referenced_blocks + blk = self.blocks[blkno] + return any(blk is ref() for ref in mgr.blocks[blkno].refs.referenced_blocks) def get_dtypes(self) -> npt.NDArray[np.object_]: dtypes = np.array([blk.dtype for blk in self.blocks], dtype=object) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 862aebdc70a9d..5d1eefccbb1e7 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -280,6 +280,17 @@ def test_reset_index_series_drop(using_copy_on_write, index): tm.assert_series_equal(ser, ser_orig) +def test_groupby_column_index_in_references(): + df = DataFrame( + {"A": ["a", "b", "c", "d"], "B": [1, 2, 3, 4], "C": ["a", "a", "b", "b"]} + ) + df = df.set_index("A") + key = df["C"] + result = df.groupby(key, observed=True).sum() + expected = df.groupby("C", observed=True).sum() + tm.assert_frame_equal(result, expected) + + def test_rename_columns(using_copy_on_write): # Case: renaming columns returns a new dataframe # + afterwards modifying the result From 58b182b0e823e25081b9a6df809d45b203733d03 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 12 Feb 2024 21:33:18 +0100 Subject: [PATCH 37/60] Backport PR #57341 on branch 2.2.x (REGR: assert_series_equal defaulting to check_exact=True for Index) (#57380) Backport PR #57341: REGR: assert_series_equal defaulting to check_exact=True for Index Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/_testing/asserters.py | 3 ++- pandas/tests/util/test_assert_series_equal.py | 8 ++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index eb6e50c1be160..47ca96c6eef44 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression in :func:`concat` changing long-standing behavior that always sorted the non-concatenation axis when the axis was a :class:`DatetimeIndex` (:issue:`57006`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) +- Fixed regression in :func:`pandas.testing.assert_series_equal` defaulting to ``check_exact=True`` when checking the :class:`Index` (:issue:`57067`) - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 3de982498e996..41d2a7344a4ed 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -902,6 +902,7 @@ def assert_series_equal( >>> tm.assert_series_equal(a, b) """ __tracebackhide__ = True + check_exact_index = False if check_exact is lib.no_default else check_exact if ( check_exact is lib.no_default and rtol is lib.no_default @@ -944,7 +945,7 @@ def assert_series_equal( right.index, exact=check_index_type, check_names=check_names, - check_exact=check_exact, + check_exact=check_exact_index, check_categorical=check_categorical, check_order=not check_like, rtol=rtol, diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 784a0347cf92b..1878e7d838064 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -474,3 +474,11 @@ def test_assert_series_equal_int_tol(): tm.assert_extension_array_equal( left.astype("Int64").values, right.astype("Int64").values, rtol=1.5 ) + + +def test_assert_series_equal_index_exact_default(): + # GH#57067 + ser1 = Series(np.zeros(6, dtype=int), [0, 0.2, 0.4, 0.6, 0.8, 1]) + ser2 = Series(np.zeros(6, dtype=int), np.linspace(0, 1, 6)) + tm.assert_series_equal(ser1, ser2) + tm.assert_frame_equal(ser1.to_frame(), ser2.to_frame()) From 169bb9cf8ca6ee7adbc9e1328e9fff378793ecd0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 12 Feb 2024 21:33:27 +0100 Subject: [PATCH 38/60] Backport PR #57340 on branch 2.2.x (REGR: shift raising for axis=1 and empty df) (#57381) Backport PR #57340: REGR: shift raising for axis=1 and empty df Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/frame.py | 3 +++ pandas/tests/frame/methods/test_shift.py | 6 ++++++ 3 files changed, 10 insertions(+) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 47ca96c6eef44..94e26ff6aa46a 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -25,6 +25,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.groupby` raising ``ValueError`` when grouping by a :class:`Series` in some cases (:issue:`57276`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) - Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`) +- Fixed regression in :meth:`DataFrame.shift` raising ``AssertionError`` for ``axis=1`` and empty :class:`DataFrame` (:issue:`57301`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 734756cb8f7c8..b531ddc418df1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5859,6 +5859,9 @@ def shift( ) fill_value = lib.no_default + if self.empty: + return self.copy() + axis = self._get_axis_number(axis) if is_list_like(periods): diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index b21aa2d687682..abb30595fdcb8 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -756,3 +756,9 @@ def test_shift_with_iterable_check_other_arguments(self): msg = "Cannot specify `suffix` if `periods` is an int." with pytest.raises(ValueError, match=msg): df.shift(1, suffix="fails") + + def test_shift_axis_one_empty(self): + # GH#57301 + df = DataFrame() + result = df.shift(1, axis=1) + tm.assert_frame_equal(result, df) From 09debec463762f209787534dc335b3bb7cd3961f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 12 Feb 2024 22:50:47 +0100 Subject: [PATCH 39/60] Backport PR #57379 on branch 2.2.x (Fix numpy-dev CI warnings) (#57383) Backport PR #57379: Fix numpy-dev CI warnings Co-authored-by: William Ayd --- .../src/vendored/ujson/python/objToJSON.c | 44 +++++++++++++++---- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index 8bba95dd456de..74ca8ead3d936 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -447,8 +447,15 @@ static void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { npyarr->curdim--; npyarr->dataptr -= npyarr->stride * npyarr->index[npyarr->stridedim]; npyarr->stridedim -= npyarr->inc; - npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); + + if (!PyArray_Check(npyarr->array)) { + PyErr_SetString(PyExc_TypeError, + "NpyArrayPassThru_iterEnd received a non-array object"); + return; + } + const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array; + npyarr->dim = PyArray_DIM(arrayobj, npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(arrayobj, npyarr->stridedim); npyarr->dataptr += npyarr->stride; NpyArr_freeItemValue(obj, tc); @@ -467,12 +474,19 @@ static int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { NpyArr_freeItemValue(obj, tc); - if (PyArray_ISDATETIME(npyarr->array)) { + if (!PyArray_Check(npyarr->array)) { + PyErr_SetString(PyExc_TypeError, + "NpyArr_iterNextItem received a non-array object"); + return 0; + } + PyArrayObject *arrayobj = (PyArrayObject *)npyarr->array; + + if (PyArray_ISDATETIME(arrayobj)) { GET_TC(tc)->itemValue = obj; Py_INCREF(obj); - ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array); + ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(arrayobj); // Also write the resolution (unit) of the ndarray - PyArray_Descr *dtype = PyArray_DESCR(npyarr->array); + PyArray_Descr *dtype = PyArray_DESCR(arrayobj); ((PyObjectEncoder *)tc->encoder)->valueUnit = get_datetime_metadata_from_dtype(dtype).base; ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr; @@ -505,8 +519,15 @@ static int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) { npyarr->curdim++; npyarr->stridedim += npyarr->inc; - npyarr->dim = PyArray_DIM(npyarr->array, npyarr->stridedim); - npyarr->stride = PyArray_STRIDE(npyarr->array, npyarr->stridedim); + if (!PyArray_Check(npyarr->array)) { + PyErr_SetString(PyExc_TypeError, + "NpyArr_iterNext received a non-array object"); + return 0; + } + const PyArrayObject *arrayobj = (const PyArrayObject *)npyarr->array; + + npyarr->dim = PyArray_DIM(arrayobj, npyarr->stridedim); + npyarr->stride = PyArray_STRIDE(arrayobj, npyarr->stridedim); npyarr->index[npyarr->stridedim] = 0; ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; @@ -1610,7 +1631,14 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (!values) { goto INVALID; } - pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); + + if (!PyArray_Check(pc->newObj)) { + PyErr_SetString(PyExc_TypeError, + "Object_beginTypeContext received a non-array object"); + goto INVALID; + } + const PyArrayObject *arrayobj = (const PyArrayObject *)pc->newObj; + pc->columnLabelsLen = PyArray_DIM(arrayobj, 0); pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->columnLabelsLen); if (!pc->columnLabels) { From 3dca4f0f8d583c67d2df56bf9ba51839cdaa6d26 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 14 Feb 2024 03:12:16 +0100 Subject: [PATCH 40/60] Backport PR #57388 on branch 2.2.x (BUG: map(na_action=ignore) not respected for Arrow & masked types) (#57413) Backport PR #57388: BUG: map(na_action=ignore) not respected for Arrow & masked types Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/arrays/arrow/array.py | 2 +- pandas/core/arrays/masked.py | 2 +- pandas/tests/extension/test_arrow.py | 7 +++++++ pandas/tests/extension/test_masked.py | 9 +++++++++ 5 files changed, 19 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 94e26ff6aa46a..9733aff0e6eb5 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -24,6 +24,7 @@ Fixed regressions - Fixed regression in :meth:`CategoricalIndex.difference` raising ``KeyError`` when other contains null values other than NaN (:issue:`57318`) - Fixed regression in :meth:`DataFrame.groupby` raising ``ValueError`` when grouping by a :class:`Series` in some cases (:issue:`57276`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) +- Fixed regression in :meth:`DataFrame.map` with ``na_action="ignore"`` not being respected for NumPy nullable and :class:`ArrowDtypes` (:issue:`57316`) - Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`) - Fixed regression in :meth:`DataFrame.shift` raising ``AssertionError`` for ``axis=1`` and empty :class:`DataFrame` (:issue:`57301`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 621a32a049f3b..f8b07bd73d315 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1414,7 +1414,7 @@ def to_numpy( def map(self, mapper, na_action=None): if is_numeric_dtype(self.dtype): - return map_array(self.to_numpy(), mapper, na_action=None) + return map_array(self.to_numpy(), mapper, na_action=na_action) else: return super().map(mapper, na_action) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 7c49ce5343158..0bc0d9f8d7a7d 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1333,7 +1333,7 @@ def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis) def map(self, mapper, na_action=None): - return map_array(self.to_numpy(), mapper, na_action=None) + return map_array(self.to_numpy(), mapper, na_action=na_action) def any(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs): """ diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index e041093bbf5bc..d9a3033b8380e 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3379,3 +3379,10 @@ def test_to_numpy_timestamp_to_int(): result = ser.to_numpy(dtype=np.int64) expected = np.array([1577853000000000000]) tm.assert_numpy_array_equal(result, expected) + + +def test_map_numeric_na_action(): + ser = pd.Series([32, 40, None], dtype="int64[pyarrow]") + result = ser.map(lambda x: 42, na_action="ignore") + expected = pd.Series([42.0, 42.0, np.nan], dtype="float64") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index 3efc561d6a125..651f783b44d1f 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -179,6 +179,15 @@ def test_map(self, data_missing, na_action): expected = data_missing.to_numpy() tm.assert_numpy_array_equal(result, expected) + def test_map_na_action_ignore(self, data_missing_for_sorting): + zero = data_missing_for_sorting[2] + result = data_missing_for_sorting.map(lambda x: zero, na_action="ignore") + if data_missing_for_sorting.dtype.kind == "b": + expected = np.array([False, pd.NA, False], dtype=object) + else: + expected = np.array([zero, np.nan, zero]) + tm.assert_numpy_array_equal(result, expected) + def _get_expected_exception(self, op_name, obj, other): try: dtype = tm.get_dtype(obj) From 11818adf539b701f7dd68ee46d920d05c52fd7ba Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 16 Feb 2024 18:26:32 +0100 Subject: [PATCH 41/60] Backport PR #57450 on branch 2.2.x (DOC: Set verbose parameter as deprecated in docstring) (#57455) Backport PR #57450: DOC: Set verbose parameter as deprecated in docstring Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> --- pandas/io/parsers/readers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index e26e7e7470461..e04f27b560610 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -240,6 +240,8 @@ performance of reading a large file. verbose : bool, default False Indicate number of ``NA`` values placed in non-numeric columns. + + .. deprecated:: 2.2.0 skip_blank_lines : bool, default True If ``True``, skip over blank lines rather than interpreting as ``NaN`` values. parse_dates : bool, list of Hashable, list of lists or dict of {{Hashable : list}}, \ From 5550bdb9bebe02ec57c54ca75835bbf12e05ceab Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 16 Feb 2024 20:21:45 +0100 Subject: [PATCH 42/60] Backport PR #57402 on branch 2.2.x (BUG: wrong future Warning on string assignment in certain condition) (#57460) Backport PR #57402: BUG: wrong future Warning on string assignment in certain condition Co-authored-by: Marco Edward Gorelli --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/dtypes/missing.py | 14 ++++++++++++ pandas/core/indexing.py | 25 +++++++++++++-------- pandas/tests/frame/indexing/test_setitem.py | 16 +++++++++++++ 4 files changed, 47 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 9733aff0e6eb5..5e814ec2a1b92 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -24,6 +24,7 @@ Fixed regressions - Fixed regression in :meth:`CategoricalIndex.difference` raising ``KeyError`` when other contains null values other than NaN (:issue:`57318`) - Fixed regression in :meth:`DataFrame.groupby` raising ``ValueError`` when grouping by a :class:`Series` in some cases (:issue:`57276`) - Fixed regression in :meth:`DataFrame.loc` raising ``IndexError`` for non-unique, masked dtype indexes where result has more than 10,000 rows (:issue:`57027`) +- Fixed regression in :meth:`DataFrame.loc` which was unnecessarily throwing "incompatible dtype warning" when expanding with partial row indexer and multiple columns (see `PDEP6 `_) (:issue:`56503`) - Fixed regression in :meth:`DataFrame.map` with ``na_action="ignore"`` not being respected for NumPy nullable and :class:`ArrowDtypes` (:issue:`57316`) - Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`) - Fixed regression in :meth:`DataFrame.shift` raising ``AssertionError`` for ``axis=1`` and empty :class:`DataFrame` (:issue:`57301`) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 4dc0d477f89e8..655a53997620a 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -647,6 +647,20 @@ def infer_fill_value(val): return np.nan +def construct_1d_array_from_inferred_fill_value( + value: object, length: int +) -> ArrayLike: + # Find our empty_value dtype by constructing an array + # from our value and doing a .take on it + from pandas.core.algorithms import take_nd + from pandas.core.construction import sanitize_array + from pandas.core.indexes.base import Index + + arr = sanitize_array(value, Index(range(1)), copy=False) + taker = -1 * np.ones(length, dtype=np.intp) + return take_nd(arr, taker) + + def maybe_fill(arr: np.ndarray) -> np.ndarray: """ Fill numpy.ndarray with NaN, unless we have a integer or boolean dtype. diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 934ba3a4d7f29..869e511fc0720 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -57,6 +57,7 @@ ABCSeries, ) from pandas.core.dtypes.missing import ( + construct_1d_array_from_inferred_fill_value, infer_fill_value, is_valid_na_for_dtype, isna, @@ -68,7 +69,6 @@ from pandas.core.construction import ( array as pd_array, extract_array, - sanitize_array, ) from pandas.core.indexers import ( check_array_indexer, @@ -844,7 +844,6 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None: if self.ndim != 2: return - orig_key = key if isinstance(key, tuple) and len(key) > 1: # key may be a tuple if we are .loc # if length of key is > 1 set key to column part @@ -862,7 +861,7 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None: keys = self.obj.columns.union(key, sort=False) diff = Index(key).difference(self.obj.columns, sort=False) - if len(diff) and com.is_null_slice(orig_key[0]): + if len(diff): # e.g. if we are doing df.loc[:, ["A", "B"]] = 7 and "B" # is a new column, add the new columns with dtype=np.void # so that later when we go through setitem_single_column @@ -1878,12 +1877,9 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc"): self.obj[key] = empty_value elif not is_list_like(value): - # Find our empty_value dtype by constructing an array - # from our value and doing a .take on it - arr = sanitize_array(value, Index(range(1)), copy=False) - taker = -1 * np.ones(len(self.obj), dtype=np.intp) - empty_value = algos.take_nd(arr, taker) - self.obj[key] = empty_value + self.obj[key] = construct_1d_array_from_inferred_fill_value( + value, len(self.obj) + ) else: # FIXME: GH#42099#issuecomment-864326014 self.obj[key] = infer_fill_value(value) @@ -2165,6 +2161,17 @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None: else: # set value into the column (first attempting to operate inplace, then # falling back to casting if necessary) + dtype = self.obj.dtypes.iloc[loc] + if dtype == np.void: + # This means we're expanding, with multiple columns, e.g. + # df = pd.DataFrame({'A': [1,2,3], 'B': [4,5,6]}) + # df.loc[df.index <= 2, ['F', 'G']] = (1, 'abc') + # Columns F and G will initially be set to np.void. + # Here, we replace those temporary `np.void` columns with + # columns of the appropriate dtype, based on `value`. + self.obj.iloc[:, loc] = construct_1d_array_from_inferred_fill_value( + value, len(self.obj) + ) self.obj._mgr.column_setitem(loc, plane_indexer, value) self.obj._clear_item_cache() diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 99233d3cd4cf3..a58dd701f0f22 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -1401,3 +1401,19 @@ def test_full_setter_loc_incompatible_dtype(): df.loc[:, "a"] = {0: 3, 1: 4} expected = DataFrame({"a": [3, 4]}) tm.assert_frame_equal(df, expected) + + +def test_setitem_partial_row_multiple_columns(): + # https://github.com/pandas-dev/pandas/issues/56503 + df = DataFrame({"A": [1, 2, 3], "B": [4.0, 5, 6]}) + # should not warn + df.loc[df.index <= 1, ["F", "G"]] = (1, "abc") + expected = DataFrame( + { + "A": [1, 2, 3], + "B": [4.0, 5, 6], + "F": [1.0, 1, float("nan")], + "G": ["abc", "abc", float("nan")], + } + ) + tm.assert_frame_equal(df, expected) From 32d2b9912fc6292778f3d0ba67b928dbcf9e401d Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 17 Feb 2024 23:33:12 +0100 Subject: [PATCH 43/60] Backport PR #57311 on branch 2.2.x (Fixing multi method for to_sql for non-oracle databases) (#57466) Backport PR #57311: Fixing multi method for to_sql for non-oracle databases Co-authored-by: Samuel Chai <121340503+kassett@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/generic.py | 3 +++ pandas/io/sql.py | 11 ++++------- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 5e814ec2a1b92..ca4fef4f57fb6 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -31,6 +31,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`) +- Fixed regression in :meth:`DataFrame.to_sql` when ``method="multi"`` is passed and the dialect type is not Oracle (:issue:`57310`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`ExtensionArray.to_numpy` raising for non-numeric masked dtypes (:issue:`56991`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 55693d4cdb753..62dd0c3bf0161 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2969,6 +2969,9 @@ def to_sql( database. Otherwise, the datetimes will be stored as timezone unaware timestamps local to the original timezone. + Not all datastores support ``method="multi"``. Oracle, for example, + does not support multi-value insert. + References ---------- .. [1] https://docs.sqlalchemy.org diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 3a58daf681cfb..195a7c5040853 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1012,22 +1012,19 @@ def _execute_insert(self, conn, keys: list[str], data_iter) -> int: def _execute_insert_multi(self, conn, keys: list[str], data_iter) -> int: """ - Alternative to _execute_insert for DBs support multivalue INSERT. + Alternative to _execute_insert for DBs support multi-value INSERT. Note: multi-value insert is usually faster for analytics DBs and tables containing a few columns but performance degrades quickly with increase of columns. + """ from sqlalchemy import insert data = [dict(zip(keys, row)) for row in data_iter] - stmt = insert(self.table) - # conn.execute is used here to ensure compatibility with Oracle. - # Using stmt.values(data) would produce a multi row insert that - # isn't supported by Oracle. - # see: https://docs.sqlalchemy.org/en/20/core/dml.html#sqlalchemy.sql.expression.Insert.values - result = conn.execute(stmt, data) + stmt = insert(self.table).values(data) + result = conn.execute(stmt) return result.rowcount def insert_data(self) -> tuple[list[str], list[np.ndarray]]: From ab8541ce51b70a8be728e8a15d78f7a5f82b62de Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 18 Feb 2024 13:35:55 +0100 Subject: [PATCH 44/60] Backport PR #57454 on branch 2.2.x (Release the gil in take for axis=1) (#57484) Backport PR #57454: Release the gil in take for axis=1 Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/_libs/algos_take_helper.pxi.in | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 88c3abba506a3..385727fad3c50 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -184,6 +184,17 @@ def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, fv = fill_value + {{if c_type_in == c_type_out != "object"}} + with nogil: + for i in range(n): + for j in range(k): + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + {{else}} for i in range(n): for j in range(k): idx = indexer[j] @@ -195,6 +206,7 @@ def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, {{else}} out[i, j] = values[i, idx] {{endif}} + {{endif}} @cython.wraparound(False) From b79fe7e19db5931a4f6165e48b04a514d235e1a8 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sun, 18 Feb 2024 10:13:56 -0500 Subject: [PATCH 45/60] REGR: DataFrame.update emits spurious warning about downcasting (#57485) --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/frame.py | 13 ++++++++++++- pandas/tests/frame/methods/test_update.py | 12 +++++++----- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index ca4fef4f57fb6..8c8db76ccc3d0 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -32,6 +32,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`) - Fixed regression in :meth:`DataFrame.to_sql` when ``method="multi"`` is passed and the dialect type is not Oracle (:issue:`57310`) +- Fixed regression in :meth:`DataFrame.update` emitting incorrect warnings about downcasting (:issue:`57124`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`ExtensionArray.to_numpy` raising for non-numeric masked dtypes (:issue:`56991`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b531ddc418df1..c09989fe87fb0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8962,6 +8962,7 @@ def update( 1 2 500.0 2 3 6.0 """ + if not PYPY and using_copy_on_write(): if sys.getrefcount(self) <= REF_COUNT: warnings.warn( @@ -9010,7 +9011,17 @@ def update( if mask.all(): continue - self.loc[:, col] = self[col].where(mask, that) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message="Downcasting behavior", + category=FutureWarning, + ) + # GH#57124 - `that` might get upcasted because of NA values, and then + # downcasted in where because of the mask. Ignoring the warning + # is a stopgap, will replace with a new implementation of update + # in 3.0. + self.loc[:, col] = self[col].where(mask, that) # ---------------------------------------------------------------------- # Data reshaping diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 20ba550beeb30..8af1798aa8e00 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -48,16 +48,18 @@ def test_update(self): def test_update_dtypes(self): # gh 3016 df = DataFrame( - [[1.0, 2.0, False, True], [4.0, 5.0, True, False]], - columns=["A", "B", "bool1", "bool2"], + [[1.0, 2.0, 1, False, True], [4.0, 5.0, 2, True, False]], + columns=["A", "B", "int", "bool1", "bool2"], ) - other = DataFrame([[45, 45]], index=[0], columns=["A", "B"]) + other = DataFrame( + [[45, 45, 3, True]], index=[0], columns=["A", "B", "int", "bool1"] + ) df.update(other) expected = DataFrame( - [[45.0, 45.0, False, True], [4.0, 5.0, True, False]], - columns=["A", "B", "bool1", "bool2"], + [[45.0, 45.0, 3, True, True], [4.0, 5.0, 2, True, False]], + columns=["A", "B", "int", "bool1", "bool2"], ) tm.assert_frame_equal(df, expected) From dfc66f6e5da81f8e7cfc8ff5b4e557ac34f80a47 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Mon, 19 Feb 2024 04:51:13 -0500 Subject: [PATCH 46/60] Backport PR #57474 on branch 2.2.x (REGR: DataFrame.transpose resulting in not contiguous data on nullable EAs) (#57496) Backport PR #57474: REGR: DataFrame.transpose resulting in not contiguous data on nullable EAs --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/arrays/masked.py | 17 ++++++++++++++--- pandas/tests/frame/methods/test_transpose.py | 17 +++++++++++++++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 8c8db76ccc3d0..e1da0af59b3b8 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -32,6 +32,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) - Fixed regression in :meth:`DataFrame.to_json` converting nullable integers to floats (:issue:`57224`) - Fixed regression in :meth:`DataFrame.to_sql` when ``method="multi"`` is passed and the dialect type is not Oracle (:issue:`57310`) +- Fixed regression in :meth:`DataFrame.transpose` with nullable extension dtypes not having F-contiguous data potentially causing exceptions when used (:issue:`57315`) - Fixed regression in :meth:`DataFrame.update` emitting incorrect warnings about downcasting (:issue:`57124`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 0bc0d9f8d7a7d..320d8cb10b8c2 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -1621,13 +1621,24 @@ def transpose_homogeneous_masked_arrays( same dtype. The caller is responsible for ensuring validity of input data. """ masked_arrays = list(masked_arrays) + dtype = masked_arrays[0].dtype + values = [arr._data.reshape(1, -1) for arr in masked_arrays] - transposed_values = np.concatenate(values, axis=0) + transposed_values = np.concatenate( + values, + axis=0, + out=np.empty( + (len(masked_arrays), len(masked_arrays[0])), + order="F", + dtype=dtype.numpy_dtype, + ), + ) masks = [arr._mask.reshape(1, -1) for arr in masked_arrays] - transposed_masks = np.concatenate(masks, axis=0) + transposed_masks = np.concatenate( + masks, axis=0, out=np.empty_like(transposed_values, dtype=bool) + ) - dtype = masked_arrays[0].dtype arr_type = dtype.construct_array_type() transposed_arrays: list[BaseMaskedArray] = [] for i in range(transposed_values.shape[1]): diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index d0caa071fae1c..3e74094f266d1 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -3,6 +3,7 @@ import pandas.util._test_decorators as td +import pandas as pd from pandas import ( DataFrame, DatetimeIndex, @@ -190,3 +191,19 @@ def test_transpose_not_inferring_dt_mixed_blocks(self): dtype=object, ) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype1", ["Int64", "Float64"]) + @pytest.mark.parametrize("dtype2", ["Int64", "Float64"]) + def test_transpose(self, dtype1, dtype2): + # GH#57315 - transpose should have F contiguous blocks + df = DataFrame( + { + "a": pd.array([1, 1, 2], dtype=dtype1), + "b": pd.array([3, 4, 5], dtype=dtype2), + } + ) + result = df.T + for blk in result._mgr.blocks: + # When dtypes are unequal, we get NumPy object array + data = blk.values._data if dtype1 == dtype2 else blk.values + assert data.flags["F_CONTIGUOUS"] From 25df68eb19c7c783afe276762f3c0a3f03ac8706 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 19 Feb 2024 21:15:56 +0100 Subject: [PATCH 47/60] Backport PR #57486 on branch 2.2.x (CI: Run excel tests on single cpu for windows) (#57507) Backport PR #57486: CI: Run excel tests on single cpu for windows Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/tests/io/excel/test_odf.py | 5 +++++ pandas/tests/io/excel/test_odswriter.py | 5 +++++ pandas/tests/io/excel/test_openpyxl.py | 5 +++++ pandas/tests/io/excel/test_readers.py | 4 ++++ pandas/tests/io/excel/test_style.py | 4 ++++ pandas/tests/io/excel/test_writers.py | 4 ++++ pandas/tests/io/excel/test_xlrd.py | 5 +++++ pandas/tests/io/excel/test_xlsxwriter.py | 5 +++++ 8 files changed, 37 insertions(+) diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py index f01827fa4ca2f..b5bb9b27258d8 100644 --- a/pandas/tests/io/excel/test_odf.py +++ b/pandas/tests/io/excel/test_odf.py @@ -3,11 +3,16 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows + import pandas as pd import pandas._testing as tm pytest.importorskip("odf") +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + @pytest.fixture(autouse=True) def cd_and_set_engine(monkeypatch, datapath): diff --git a/pandas/tests/io/excel/test_odswriter.py b/pandas/tests/io/excel/test_odswriter.py index 271353a173d2a..1c728ad801bc1 100644 --- a/pandas/tests/io/excel/test_odswriter.py +++ b/pandas/tests/io/excel/test_odswriter.py @@ -6,6 +6,8 @@ import pytest +from pandas.compat import is_platform_windows + import pandas as pd import pandas._testing as tm @@ -13,6 +15,9 @@ odf = pytest.importorskip("odf") +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + @pytest.fixture def ext(): diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 2df9ec9e53516..e53b5830ec6a4 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows + import pandas as pd from pandas import DataFrame import pandas._testing as tm @@ -17,6 +19,9 @@ openpyxl = pytest.importorskip("openpyxl") +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + @pytest.fixture def ext(): diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 15712f36da4ca..8da8535952dcf 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -18,6 +18,7 @@ from pandas._config import using_pyarrow_string_dtype +from pandas.compat import is_platform_windows import pandas.util._test_decorators as td import pandas as pd @@ -34,6 +35,9 @@ StringArray, ) +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"] engine_params = [ # Add any engines to test here diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index 3ca8637885639..89615172688d7 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -4,6 +4,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows import pandas.util._test_decorators as td from pandas import ( @@ -20,6 +21,9 @@ # could compute styles and render to excel without jinja2, since there is no # 'template' file, but this needs the import error to delayed until render time. +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + def assert_equal_cell_styles(cell1, cell2): # TODO: should find a better way to check equality diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 8c003723c1c71..292eab2d88152 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -11,6 +11,7 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows from pandas.compat._constants import PY310 from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td @@ -34,6 +35,9 @@ ) from pandas.io.excel._util import _writers +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + def get_exp_unit(path: str) -> str: return "ns" diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 6d5008ca9ee68..066393d91eead 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.compat import is_platform_windows + import pandas as pd import pandas._testing as tm @@ -11,6 +13,9 @@ xlrd = pytest.importorskip("xlrd") +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + @pytest.fixture(params=[".xls"]) def read_ext_xlrd(request): diff --git a/pandas/tests/io/excel/test_xlsxwriter.py b/pandas/tests/io/excel/test_xlsxwriter.py index 94f6bdfaf069c..529367761fc02 100644 --- a/pandas/tests/io/excel/test_xlsxwriter.py +++ b/pandas/tests/io/excel/test_xlsxwriter.py @@ -2,6 +2,8 @@ import pytest +from pandas.compat import is_platform_windows + from pandas import DataFrame import pandas._testing as tm @@ -9,6 +11,9 @@ xlsxwriter = pytest.importorskip("xlsxwriter") +if is_platform_windows(): + pytestmark = pytest.mark.single_cpu + @pytest.fixture def ext(): From 6766c92fbac05118374c3963ac4b59524d80cf82 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 19 Feb 2024 23:12:25 +0100 Subject: [PATCH 48/60] Backport PR #57490 on branch 2.2.x (DOC: Add a few deprecation notes) (#57509) Backport PR #57490: DOC: Add a few deprecation notes Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/dtypes/common.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 2245359fd8eac..df0251d141984 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -169,6 +169,9 @@ def is_sparse(arr) -> bool: """ Check whether an array-like is a 1-D pandas sparse array. + .. deprecated:: 2.1.0 + Use isinstance(dtype, pd.SparseDtype) instead. + Check that the one-dimensional array-like is a pandas sparse array. Returns True if it is a pandas sparse array, not another type of sparse array. @@ -295,6 +298,9 @@ def is_datetime64tz_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of a DatetimeTZDtype dtype. + .. deprecated:: 2.1.0 + Use isinstance(dtype, pd.DatetimeTZDtype) instead. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -381,6 +387,9 @@ def is_period_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the Period dtype. + .. deprecated:: 2.2.0 + Use isinstance(dtype, pd.Period) instead. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -424,6 +433,9 @@ def is_interval_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the Interval dtype. + .. deprecated:: 2.2.0 + Use isinstance(dtype, pd.IntervalDtype) instead. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -470,6 +482,9 @@ def is_categorical_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the Categorical dtype. + .. deprecated:: 2.2.0 + Use isinstance(dtype, pd.CategoricalDtype) instead. + Parameters ---------- arr_or_dtype : array-like or dtype From 2ae7a1057a1c64b4cddf94aa5fa439f5a14ea5d1 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 20 Feb 2024 02:20:20 +0100 Subject: [PATCH 49/60] Backport PR #57488 on branch 2.2.x (REGR: query raising for all NaT in object column) (#57515) Backport PR #57488: REGR: query raising for all NaT in object column Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/core/generic.py | 2 +- pandas/tests/frame/test_query_eval.py | 8 ++++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index e1da0af59b3b8..6c6a37b2b2f89 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -27,6 +27,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.loc` which was unnecessarily throwing "incompatible dtype warning" when expanding with partial row indexer and multiple columns (see `PDEP6 `_) (:issue:`56503`) - Fixed regression in :meth:`DataFrame.map` with ``na_action="ignore"`` not being respected for NumPy nullable and :class:`ArrowDtypes` (:issue:`57316`) - Fixed regression in :meth:`DataFrame.merge` raising ``ValueError`` for certain types of 3rd-party extension arrays (:issue:`57316`) +- Fixed regression in :meth:`DataFrame.query` with all ``NaT`` column with object dtype (:issue:`57068`) - Fixed regression in :meth:`DataFrame.shift` raising ``AssertionError`` for ``axis=1`` and empty :class:`DataFrame` (:issue:`57301`) - Fixed regression in :meth:`DataFrame.sort_index` not producing a stable sort for a index with duplicates (:issue:`57151`) - Fixed regression in :meth:`DataFrame.to_dict` with ``orient='list'`` and datetime or timedelta types returning integers (:issue:`54824`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 62dd0c3bf0161..0ec17173287f5 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -657,7 +657,7 @@ def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]: return { clean_column_name(k): Series( - v, copy=False, index=self.index, name=k + v, copy=False, index=self.index, name=k, dtype=self.dtypes[k] ).__finalize__(self) for k, v in zip(self.columns, self._iter_column_arrays()) if not isinstance(k, int) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index a498296e09c52..2c807c72582c5 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1415,3 +1415,11 @@ def test_query_ea_equality_comparison(self, dtype, engine): } ) tm.assert_frame_equal(result, expected) + + def test_all_nat_in_object(self): + # GH#57068 + now = pd.Timestamp.now("UTC") # noqa: F841 + df = DataFrame({"a": pd.to_datetime([None, None], utc=True)}, dtype=object) + result = df.query("a > @now") + expected = DataFrame({"a": []}, dtype=object) + tm.assert_frame_equal(result, expected) From 9b1ce062d3985e7556fc7fa6f9d4c6bc184576eb Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 20 Feb 2024 19:51:27 +0100 Subject: [PATCH 50/60] Backport PR #57489 on branch 2.2.x (REGR: astype introducing decimals when casting from int with na to string) (#57533) Backport PR #57489: REGR: astype introducing decimals when casting from int with na to string Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/_libs/lib.pyx | 2 +- pandas/tests/series/methods/test_astype.py | 8 ++++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 6c6a37b2b2f89..85b9346aa01a5 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -39,6 +39,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) - Fixed regression in :meth:`ExtensionArray.to_numpy` raising for non-numeric masked dtypes (:issue:`56991`) - Fixed regression in :meth:`Index.join` raising ``TypeError`` when joining an empty index to a non-empty index containing mixed dtype values (:issue:`57048`) +- Fixed regression in :meth:`Series.astype` introducing decimals when converting from integer with missing values to string dtype (:issue:`57418`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) - Fixed regression in :meth:`Series.to_numpy` when dtype is given as float and the data contains NaNs (:issue:`57121`) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c483f35513a40..7656e8d986117 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -759,7 +759,7 @@ cpdef ndarray[object] ensure_string_array( out = arr.astype(str).astype(object) out[arr.isna()] = na_value return out - arr = arr.to_numpy() + arr = arr.to_numpy(dtype=object) elif not util.is_array(arr): arr = np.array(arr, dtype="object") diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 46f55fff91e41..4c8028e74ee55 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -673,3 +673,11 @@ def test_astype_timedelta64_with_np_nan(self): result = Series([Timedelta(1), np.nan], dtype="timedelta64[ns]") expected = Series([Timedelta(1), NaT], dtype="timedelta64[ns]") tm.assert_series_equal(result, expected) + + @td.skip_if_no("pyarrow") + def test_astype_int_na_string(self): + # GH#57418 + ser = Series([12, NA], dtype="Int64[pyarrow]") + result = ser.astype("string[pyarrow]") + expected = Series(["12", NA], dtype="string[pyarrow]") + tm.assert_series_equal(result, expected) From 0b49cf35bdc025e69ef104d59660cb08b973fec5 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 20 Feb 2024 22:32:29 +0100 Subject: [PATCH 51/60] Backport PR #57536 on branch 2.2.x (BUG: dt64 + DateOffset with milliseconds) (#57537) Backport PR #57536: BUG: dt64 + DateOffset with milliseconds Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/_libs/tslibs/offsets.pyx | 15 +++++++++- pandas/tests/arithmetic/test_datetime64.py | 32 ++++++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 85b9346aa01a5..ab62a7c7598d5 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -42,6 +42,7 @@ Fixed regressions - Fixed regression in :meth:`Series.astype` introducing decimals when converting from integer with missing values to string dtype (:issue:`57418`) - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`) - Fixed regression in :meth:`Series.to_numpy` when dtype is given as float and the data contains NaNs (:issue:`57121`) +- Fixed regression in addition or subtraction of :class:`DateOffset` objects with millisecond components to ``datetime64`` :class:`Index`, :class:`Series`, or :class:`DataFrame` (:issue:`57529`) .. --------------------------------------------------------------------------- .. _whatsnew_221.bug_fixes: diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 70e1ca1c4012a..c37a4b285daef 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1458,13 +1458,22 @@ cdef class RelativeDeltaOffset(BaseOffset): "minutes", "seconds", "microseconds", + "milliseconds", } # relativedelta/_offset path only valid for base DateOffset if self._use_relativedelta and set(kwds).issubset(relativedelta_fast): + td_args = { + "days", + "hours", + "minutes", + "seconds", + "microseconds", + "milliseconds" + } td_kwds = { key: val for key, val in kwds.items() - if key in ["days", "hours", "minutes", "seconds", "microseconds"] + if key in td_args } if "weeks" in kwds: days = td_kwds.get("days", 0) @@ -1474,6 +1483,8 @@ cdef class RelativeDeltaOffset(BaseOffset): delta = Timedelta(**td_kwds) if "microseconds" in kwds: delta = delta.as_unit("us") + elif "milliseconds" in kwds: + delta = delta.as_unit("ms") else: delta = delta.as_unit("s") else: @@ -1491,6 +1502,8 @@ cdef class RelativeDeltaOffset(BaseOffset): delta = Timedelta(self._offset * self.n) if "microseconds" in kwds: delta = delta.as_unit("us") + elif "milliseconds" in kwds: + delta = delta.as_unit("ms") else: delta = delta.as_unit("s") return delta diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index dbff88dc6f4f6..a468449efd507 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1586,6 +1586,38 @@ def test_dti_add_sub_nonzero_mth_offset( expected = tm.box_expected(expected, box_with_array, False) tm.assert_equal(result, expected) + def test_dt64arr_series_add_DateOffset_with_milli(self): + # GH 57529 + dti = DatetimeIndex( + [ + "2000-01-01 00:00:00.012345678", + "2000-01-31 00:00:00.012345678", + "2000-02-29 00:00:00.012345678", + ], + dtype="datetime64[ns]", + ) + result = dti + DateOffset(milliseconds=4) + expected = DatetimeIndex( + [ + "2000-01-01 00:00:00.016345678", + "2000-01-31 00:00:00.016345678", + "2000-02-29 00:00:00.016345678", + ], + dtype="datetime64[ns]", + ) + tm.assert_index_equal(result, expected) + + result = dti + DateOffset(days=1, milliseconds=4) + expected = DatetimeIndex( + [ + "2000-01-02 00:00:00.016345678", + "2000-02-01 00:00:00.016345678", + "2000-03-01 00:00:00.016345678", + ], + dtype="datetime64[ns]", + ) + tm.assert_index_equal(result, expected) + class TestDatetime64OverflowHandling: # TODO: box + de-duplicate From c101d3022b0a6b3a2375db64111c3f6fe516acf3 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 20 Feb 2024 22:32:46 +0100 Subject: [PATCH 52/60] Backport PR #57510 on branch 2.2.x (DOC: Fix xarray example) (#57538) DOC: Fix xarray example (#57510) * DOC: Fix xarray example * Update * Skip doctest * Igore another doctest --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> (cherry picked from commit 33a1cd7163ce712e5a38fdb5d2e04de203b3ddf9) --- pandas/core/generic.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0ec17173287f5..1dd471a09f1b1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3278,18 +3278,18 @@ def to_xarray(self): 2 lion mammal 80.5 4 3 monkey mammal NaN 4 - >>> df.to_xarray() + >>> df.to_xarray() # doctest: +SKIP Dimensions: (index: 4) Coordinates: - * index (index) int64 0 1 2 3 + * index (index) int64 32B 0 1 2 3 Data variables: - name (index) object 'falcon' 'parrot' 'lion' 'monkey' - class (index) object 'bird' 'bird' 'mammal' 'mammal' - max_speed (index) float64 389.0 24.0 80.5 nan - num_legs (index) int64 2 2 4 4 + name (index) object 32B 'falcon' 'parrot' 'lion' 'monkey' + class (index) object 32B 'bird' 'bird' 'mammal' 'mammal' + max_speed (index) float64 32B 389.0 24.0 80.5 nan + num_legs (index) int64 32B 2 2 4 4 - >>> df['max_speed'].to_xarray() + >>> df['max_speed'].to_xarray() # doctest: +SKIP array([389. , 24. , 80.5, nan]) Coordinates: @@ -3311,7 +3311,7 @@ class (index) object 'bird' 'bird' 'mammal' 'mammal' 2018-01-02 falcon 361 parrot 15 - >>> df_multiindex.to_xarray() + >>> df_multiindex.to_xarray() # doctest: +SKIP Dimensions: (date: 2, animal: 2) Coordinates: From 3a4033c7296a665fcbbbcd49a05a420611fa3f3c Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 21 Feb 2024 21:46:10 +0100 Subject: [PATCH 53/60] Backport PR #57439 on branch 2.2.x (BUG: read_json returning Index instead of RangeIndex) (#57552) Backport PR #57439: BUG: read_json returning Index instead of RangeIndex Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 1 + pandas/io/json/_json.py | 25 ++++++++++++++----------- pandas/tests/io/json/test_pandas.py | 16 ++++++++++++++-- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index ab62a7c7598d5..d81032fafa730 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -18,6 +18,7 @@ Fixed regressions - Fixed regression in :func:`concat` changing long-standing behavior that always sorted the non-concatenation axis when the axis was a :class:`DatetimeIndex` (:issue:`57006`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`pandas.testing.assert_series_equal` defaulting to ``check_exact=True`` when checking the :class:`Index` (:issue:`57067`) +- Fixed regression in :func:`read_json` where an :class:`Index` would be returned instead of a :class:`RangeIndex` (:issue:`57429`) - Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`) - Fixed regression in :meth:`.DataFrameGroupBy.idxmin`, :meth:`.DataFrameGroupBy.idxmax`, :meth:`.SeriesGroupBy.idxmin`, :meth:`.SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 4c490c6b2cda2..9414f45215029 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1266,6 +1266,7 @@ def _try_convert_data( if result: return new_data, True + converted = False if self.dtype_backend is not lib.no_default and not is_axis: # Fall through for conversion later on return data, True @@ -1273,16 +1274,17 @@ def _try_convert_data( # try float try: data = data.astype("float64") + converted = True except (TypeError, ValueError): pass - if data.dtype.kind == "f": - if data.dtype != "float64": - # coerce floats to 64 - try: - data = data.astype("float64") - except (TypeError, ValueError): - pass + if data.dtype.kind == "f" and data.dtype != "float64": + # coerce floats to 64 + try: + data = data.astype("float64") + converted = True + except (TypeError, ValueError): + pass # don't coerce 0-len data if len(data) and data.dtype in ("float", "object"): @@ -1291,14 +1293,15 @@ def _try_convert_data( new_data = data.astype("int64") if (new_data == data).all(): data = new_data + converted = True except (TypeError, ValueError, OverflowError): pass - # coerce ints to 64 - if data.dtype == "int": - # coerce floats to 64 + if data.dtype == "int" and data.dtype != "int64": + # coerce ints to 64 try: data = data.astype("int64") + converted = True except (TypeError, ValueError): pass @@ -1307,7 +1310,7 @@ def _try_convert_data( if self.orient == "split": return data, False - return data, True + return data, converted @final def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]: diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index caa25841d3596..5279f3f1cdfbe 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -24,6 +24,7 @@ DataFrame, DatetimeIndex, Index, + RangeIndex, Series, Timestamp, date_range, @@ -493,12 +494,12 @@ def test_frame_mixedtype_orient(self): # GH10289 left = read_json(inp, orient=orient, convert_axes=False) tm.assert_frame_equal(left, right) - right.index = pd.RangeIndex(len(df)) + right.index = RangeIndex(len(df)) inp = StringIO(df.to_json(orient="records")) left = read_json(inp, orient="records", convert_axes=False) tm.assert_frame_equal(left, right) - right.columns = pd.RangeIndex(df.shape[1]) + right.columns = RangeIndex(df.shape[1]) inp = StringIO(df.to_json(orient="values")) left = read_json(inp, orient="values", convert_axes=False) tm.assert_frame_equal(left, right) @@ -2188,3 +2189,14 @@ def test_to_json_ea_null(): {"a":null,"b":null} """ assert result == expected + + +def test_read_json_lines_rangeindex(): + # GH 57429 + data = """ +{"a": 1, "b": 2} +{"a": 3, "b": 4} +""" + result = read_json(StringIO(data), lines=True).index + expected = RangeIndex(2) + tm.assert_index_equal(result, expected, exact=True) From 3bfedfef7401f6af7e551b9fd2182442b70f0409 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 21 Feb 2024 13:49:28 -1000 Subject: [PATCH 54/60] Backport PR #57551: BLD: Add pyarrow extra for pip installation (#57557) --- .github/workflows/package-checks.yml | 2 +- doc/source/whatsnew/v2.2.1.rst | 6 ++++++ pyproject.toml | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index d59ddf272f705..7c1da5678a2aa 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -24,7 +24,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - extra: ["test", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "consortium-standard", "all"] + extra: ["test", "pyarrow", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "consortium-standard", "all"] fail-fast: false name: Install Extras - ${{ matrix.extra }} concurrency: diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index d81032fafa730..e381f9df16383 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -9,6 +9,12 @@ including other versions of pandas. {{ header }} .. --------------------------------------------------------------------------- +.. _whatsnew_221.enhancements: + +Enhancements +~~~~~~~~~~~~ +- Added ``pyarrow`` pip extra so users can install pandas and pyarrow with pip with ``pip install pandas[pyarrow]`` (:issue:`54466`) + .. _whatsnew_221.regressions: Fixed regressions diff --git a/pyproject.toml b/pyproject.toml index 8c9a79aa2b059..c225ed80dcb10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,7 @@ matplotlib = "pandas:plotting._matplotlib" [project.optional-dependencies] test = ['hypothesis>=6.46.1', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0'] +pyarrow = ['pyarrow>=10.0.1'] performance = ['bottleneck>=1.3.6', 'numba>=0.56.4', 'numexpr>=2.8.4'] computation = ['scipy>=1.10.0', 'xarray>=2022.12.0'] fss = ['fsspec>=2022.11.0'] From ea56e0cd720b8a4e1c26df7387e8ccb4badf6eb2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 22 Feb 2024 19:45:36 +0100 Subject: [PATCH 55/60] Backport PR #57556 on branch 2.2.x (Remove PyArrow deprecation warning) (#57568) --- .github/workflows/unit-tests.yml | 5 +---- doc/source/whatsnew/v2.2.1.rst | 10 ++++++++++ pandas/__init__.py | 31 +------------------------------ pandas/compat/pyarrow.py | 2 -- pandas/tests/test_common.py | 25 ------------------------- 5 files changed, 12 insertions(+), 61 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 4c7aa1e1e49ee..8736674bbf965 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -92,10 +92,7 @@ jobs: - name: "Numpy Dev" env_file: actions-311-numpydev.yaml pattern: "not slow and not network and not single_cpu" - # Currently restricted the warnings that error to Deprecation Warnings from numpy - # done since pyarrow isn't compatible with numpydev always - # TODO: work with pyarrow to revert this? - test_args: "-W error::DeprecationWarning:numpy -W error::FutureWarning:numpy" + test_args: "-W error::DeprecationWarning -W error::FutureWarning" - name: "Pyarrow Nightly" env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index e381f9df16383..35d64c99f2002 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -67,6 +67,16 @@ Bug fixes Other ~~~~~ + +.. note:: + + The ``DeprecationWarning`` that was raised when pandas was imported without PyArrow being + installed has been removed. This decision was made because the warning was too noisy for too + many users and a lot of feedback was collected about the decision to make PyArrow a required + dependency. Pandas is currently considering the decision whether or not PyArrow should be added + as a hard dependency in 3.0. Interested users can follow the discussion + `here `_. + - Added the argument ``skipna`` to :meth:`DataFrameGroupBy.first`, :meth:`DataFrameGroupBy.last`, :meth:`SeriesGroupBy.first`, and :meth:`SeriesGroupBy.last`; achieving ``skipna=False`` used to be available via :meth:`DataFrameGroupBy.nth`, but the behavior was changed in pandas 2.0.0 (:issue:`57019`) - Added the argument ``skipna`` to :meth:`Resampler.first`, :meth:`Resampler.last` (:issue:`57019`) diff --git a/pandas/__init__.py b/pandas/__init__.py index ed524c2bb3619..ca2eba2043292 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -203,36 +203,7 @@ stacklevel=2, ) -# DeprecationWarning for missing pyarrow -from pandas.compat.pyarrow import pa_version_under10p1, pa_not_found - -if pa_version_under10p1: - # pyarrow is either too old or nonexistent, warn - from pandas.compat._optional import VERSIONS - - if pa_not_found: - pa_msg = "was not found to be installed on your system." - else: - pa_msg = ( - f"was too old on your system - pyarrow {VERSIONS['pyarrow']} " - "is the current minimum supported version as of this release." - ) - - warnings.warn( - f""" -Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0), -(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries) -but {pa_msg} -If this would cause problems for you, -please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466 - """, # noqa: E501 - DeprecationWarning, - stacklevel=2, - ) - del VERSIONS, pa_msg - -# Delete all unnecessary imported modules -del pa_version_under10p1, pa_not_found, warnings, os +del warnings, os # module level doc-string __doc__ = """ diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 2e151123ef2c9..beb4814914101 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -8,7 +8,6 @@ import pyarrow as pa _palv = Version(Version(pa.__version__).base_version) - pa_not_found = False pa_version_under10p1 = _palv < Version("10.0.1") pa_version_under11p0 = _palv < Version("11.0.0") pa_version_under12p0 = _palv < Version("12.0.0") @@ -17,7 +16,6 @@ pa_version_under14p1 = _palv < Version("14.0.1") pa_version_under15p0 = _palv < Version("15.0.0") except ImportError: - pa_not_found = True pa_version_under10p1 = True pa_version_under11p0 = True pa_version_under12p0 = True diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 4af71be11fe6b..e8a1c961c8cb6 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -8,8 +8,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import Series import pandas._testing as tm @@ -267,26 +265,3 @@ def test_bz2_missing_import(): code = textwrap.dedent(code) call = [sys.executable, "-c", code] subprocess.check_output(call) - - -@td.skip_if_installed("pyarrow") -@pytest.mark.parametrize("module", ["pandas", "pandas.arrays"]) -def test_pyarrow_missing_warn(module): - # GH56896 - response = subprocess.run( - [sys.executable, "-c", f"import {module}"], - capture_output=True, - check=True, - ) - msg = """ -Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0), -(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries) -but was not found to be installed on your system. -If this would cause problems for you, -please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466 -""" # noqa: E501 - stderr_msg = response.stderr.decode("utf-8") - # Split by \n to avoid \r\n vs \n differences on Windows/Unix - # https://stackoverflow.com/questions/11989501/replacing-r-n-with-n - stderr_msg = "\n".join(stderr_msg.splitlines()) - assert msg in stderr_msg From 5521dc98f9fe73cc3a0adbbf61a2e15f3ae0736e Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 23 Feb 2024 01:38:10 +0100 Subject: [PATCH 56/60] Backport PR #57314 on branch 2.2.x (BUG: Fix near-minimum timestamp handling) (#57573) Backport PR #57314: BUG: Fix near-minimum timestamp handling Co-authored-by: Robert Schmidtke --- doc/source/whatsnew/v2.2.1.rst | 1 + .../src/vendored/numpy/datetime/np_datetime.c | 18 ++++++++++++++---- pandas/tests/tslibs/test_array_to_datetime.py | 17 +++++++++++++++++ 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 35d64c99f2002..fa5304c1c3b56 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -21,6 +21,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed memory leak in :func:`read_csv` (:issue:`57039`) - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) +- Fixed regression causing overflow for near-minimum timestamps (:issue:`57150`) - Fixed regression in :func:`concat` changing long-standing behavior that always sorted the non-concatenation axis when the axis was a :class:`DatetimeIndex` (:issue:`57006`) - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`) - Fixed regression in :func:`pandas.testing.assert_series_equal` defaulting to ``check_exact=True`` when checking the :class:`Index` (:issue:`57067`) diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index 06e3251db8315..277d01807f2f3 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -482,10 +482,20 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, if (base == NPY_FR_ns) { int64_t nanoseconds; - PD_CHECK_OVERFLOW( - scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds)); - PD_CHECK_OVERFLOW( - checked_int64_add(nanoseconds, dts->ps / 1000, &nanoseconds)); + + // Minimum valid timestamp in nanoseconds (1677-09-21 00:12:43.145224193). + const int64_t min_nanoseconds = NPY_MIN_INT64 + 1; + if (microseconds == min_nanoseconds / 1000 - 1) { + // For values within one microsecond of min_nanoseconds, use it as base + // and offset it with nanosecond delta to avoid overflow during scaling. + PD_CHECK_OVERFLOW(checked_int64_add( + min_nanoseconds, (dts->ps - _NS_MIN_DTS.ps) / 1000, &nanoseconds)); + } else { + PD_CHECK_OVERFLOW( + scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds)); + PD_CHECK_OVERFLOW( + checked_int64_add(nanoseconds, dts->ps / 1000, &nanoseconds)); + } return nanoseconds; } diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 632d3b4cc3c84..82175c67764f8 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -296,6 +296,23 @@ def test_to_datetime_barely_out_of_bounds(): tslib.array_to_datetime(arr) +@pytest.mark.parametrize( + "timestamp", + [ + # Close enough to bounds that scaling micros to nanos overflows + # but adding nanos would result in an in-bounds datetime. + "1677-09-21T00:12:43.145224193", + "1677-09-21T00:12:43.145224999", + # this always worked + "1677-09-21T00:12:43.145225000", + ], +) +def test_to_datetime_barely_inside_bounds(timestamp): + # see gh-57150 + result, _ = tslib.array_to_datetime(np.array([timestamp], dtype=object)) + tm.assert_numpy_array_equal(result, np.array([timestamp], dtype="M8[ns]")) + + class SubDatetime(datetime): pass From bdbb17993cacf916648c1f3e0aa637f459346a4f Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 23 Feb 2024 05:47:35 +0100 Subject: [PATCH 57/60] Backport PR #57576 on branch 2.2.x (DOC: Add release date for 2.2.1) (#57579) Backport PR #57576: DOC: Add release date for 2.2.1 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.2.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index fa5304c1c3b56..f4ed594613349 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -1,6 +1,6 @@ .. _whatsnew_221: -What's new in 2.2.1 (February XX, 2024) +What's new in 2.2.1 (February 22, 2024) --------------------------------------- These are the changes in pandas 2.2.1. See :ref:`release` for a full changelog From 541448e41cefbff27e45866ad173826fd65e807e Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Fri, 23 Feb 2024 08:36:39 -0500 Subject: [PATCH 58/60] RLS: 2.2.1 From 470b886470f202e578c77952db8163e49f3efeab Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 23 Feb 2024 15:48:32 +0100 Subject: [PATCH 59/60] Backport PR #57582 on branch 2.2.x (DOC: Add contributors for 2.2.1) (#57583) Backport PR #57582: DOC: Add contributors for 2.2.1 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.2.0.rst | 2 +- doc/source/whatsnew/v2.2.1.rst | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index d9ab0452c8334..e015afb17dce5 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -946,4 +946,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v2.1.4..v2.2.0|HEAD +.. contributors:: v2.1.4..v2.2.0 diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index f4ed594613349..310dd921e44f6 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -86,3 +86,5 @@ Other Contributors ~~~~~~~~~~~~ + +.. contributors:: v2.2.0..v2.2.1|HEAD From bdc79c146c2e32f2cab629be240f01658cfb6cc2 Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Fri, 23 Feb 2024 09:49:27 -0500 Subject: [PATCH 60/60] RLS: 2.2.1