From ce5d1d506a6adeb8ac4a194c9cdfbb199adddb04 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 24 Apr 2023 18:57:47 +0200 Subject: [PATCH 01/41] Backport PR #52893 on branch 2.0.x (DOC: Add whatsnew for 2.0.2) (#52896) Backport PR #52893: DOC: Add whatsnew for 2.0.2 Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v2.0.2.rst | 38 ++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 doc/source/whatsnew/v2.0.2.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 6b12cd93ae387..f2d2a54e4d1fe 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,7 @@ Version 2.0 .. toctree:: :maxdepth: 2 + v2.0.2 v2.0.1 v2.0.0 diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst new file mode 100644 index 0000000000000..0a6738cb9b3dc --- /dev/null +++ b/doc/source/whatsnew/v2.0.2.rst @@ -0,0 +1,38 @@ +.. _whatsnew_202: + +What's new in 2.0.2 (May ..., 2023) +----------------------------------- + +These are the changes in pandas 2.0.2. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_202.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_202.bug_fixes: + +Bug fixes +~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_202.other: + +Other +~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_202.contributors: + +Contributors +~~~~~~~~~~~~ + +.. 
contributors:: v2.0.1..v2.0.2|HEAD From 9fe8b3a2e10a2b59b7a724c02c4e853ede7bfff4 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 24 Apr 2023 22:37:44 +0200 Subject: [PATCH 02/41] Backport PR #52891 on branch 2.0.x (Bump pypa/cibuildwheel from 2.12.1 to 2.12.3) (#52900) Backport PR #52891: Bump pypa/cibuildwheel from 2.12.1 to 2.12.3 Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index f0d422b01e0c8..07f3224679e90 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -71,7 +71,7 @@ jobs: fetch-depth: 0 - name: Build wheels - uses: pypa/cibuildwheel@v2.12.1 + uses: pypa/cibuildwheel@v2.12.3 env: CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} From 8c3bbc7a017a0c77e49ab340d7c430fe48c0e577 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 24 Apr 2023 14:07:21 -0700 Subject: [PATCH 03/41] Backport PR #52694 on branch 2.0.x (CI: Combine unit test workflows) (#52901) CI: Combine unit test workflows (#52694) (cherry picked from commit c9c6a1f05c657b1b4159d2f47be02e09448c1e78) Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- .github/workflows/32-bit-linux.yml | 58 ----- .github/workflows/macos-windows.yml | 59 ----- .github/workflows/python-dev.yml | 95 -------- .github/workflows/ubuntu.yml | 167 -------------- .github/workflows/unit-tests.yml | 315 ++++++++++++++++++++++++++ pandas/tests/tools/test_to_numeric.py | 2 + 6 files changed, 317 insertions(+), 379 deletions(-) delete mode 100644 .github/workflows/32-bit-linux.yml delete mode 100644 .github/workflows/macos-windows.yml delete mode 100644 .github/workflows/python-dev.yml delete mode 100644 .github/workflows/ubuntu.yml create mode 100644 .github/workflows/unit-tests.yml diff --git a/.github/workflows/32-bit-linux.yml b/.github/workflows/32-bit-linux.yml deleted file mode 100644 index 08026a5fd637f..0000000000000 --- a/.github/workflows/32-bit-linux.yml +++ /dev/null @@ -1,58 +0,0 @@ -name: 32 Bit Linux - -on: - push: - branches: - - main - - 2.0.x - pull_request: - branches: - - main - - 2.0.x - paths-ignore: - - "doc/**" - -permissions: - contents: read - -jobs: - pytest: - runs-on: ubuntu-22.04 - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Run 32-bit manylinux2014 Docker Build / Tests - run: | - # Without this (line 34), versioneer will not be able to determine the pandas version. - # This is because of a security update to git that blocks it from reading the config folder if - # it is not owned by the current user. We hit this since the "mounted" folder is not hit by the - # Docker container. - # xref https://github.com/pypa/manylinux/issues/1309 - docker pull quay.io/pypa/manylinux2014_i686 - docker run --platform linux/386 -v $(pwd):/pandas quay.io/pypa/manylinux2014_i686 \ - /bin/bash -xc "cd pandas && \ - git config --global --add safe.directory /pandas && \ - /opt/python/cp38-cp38/bin/python -m venv ~/virtualenvs/pandas-dev && \ - . 
~/virtualenvs/pandas-dev/bin/activate && \ - python -m pip install --no-deps -U pip wheel 'setuptools<60.0.0' && \ - python -m pip install versioneer[toml] && \ - python -m pip install cython numpy python-dateutil pytz pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.34.2 && \ - python setup.py build_ext -q -j1 && \ - python -m pip install --no-build-isolation --no-use-pep517 -e . && \ - python -m pip list && \ - export PANDAS_CI=1 && \ - pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml" - - - name: Publish test results for Python 3.8-32 bit full Linux - uses: actions/upload-artifact@v3 - with: - name: Test results - path: test-data.xml - if: failure() - concurrency: - # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-32bit - cancel-in-progress: true diff --git a/.github/workflows/macos-windows.yml b/.github/workflows/macos-windows.yml deleted file mode 100644 index fd6560d61b160..0000000000000 --- a/.github/workflows/macos-windows.yml +++ /dev/null @@ -1,59 +0,0 @@ -name: Windows-macOS - -on: - push: - branches: - - main - - 2.0.x - pull_request: - branches: - - main - - 2.0.x - paths-ignore: - - "doc/**" - -env: - PANDAS_CI: 1 - PYTEST_TARGET: pandas - PATTERN: "not slow and not db and not network and not single_cpu" - -permissions: - contents: read - -jobs: - pytest: - defaults: - run: - shell: bash -el {0} - timeout-minutes: 180 - strategy: - matrix: - os: [macos-latest, windows-latest] - env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml] - fail-fast: false - runs-on: ${{ matrix.os }} - name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }} - concurrency: - # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.os }} - cancel-in-progress: true - env: - # GH 47443: PYTEST_WORKERS > 1 crashes Windows builds with memory related errors - PYTEST_WORKERS: ${{ matrix.os == 'macos-latest' && 'auto' || '1' }} - - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Set up Conda - uses: ./.github/actions/setup-conda - with: - environment-file: ci/deps/${{ matrix.env_file }} - - - name: Build Pandas - uses: ./.github/actions/build_pandas - - - name: Test - uses: ./.github/actions/run-tests diff --git a/.github/workflows/python-dev.yml b/.github/workflows/python-dev.yml deleted file mode 100644 index 39b0439e2d6f4..0000000000000 --- a/.github/workflows/python-dev.yml +++ /dev/null @@ -1,95 +0,0 @@ -# This workflow may or may not run depending on the state of the next -# unreleased Python version. DO NOT DELETE IT. -# -# In general, this file will remain frozen(present, but not running) until: -# - The next unreleased Python version has released beta 1 -# - This version should be available on GitHub Actions. -# - Our required build/runtime dependencies(numpy, pytz, Cython, python-dateutil) -# support that unreleased Python version. -# To unfreeze, comment out the ``if: false`` condition, and make sure you update -# the name of the workflow and Python version in actions/setup-python to: '3.12-dev' -# -# After it has been unfrozen, this file should remain unfrozen(present, and running) until: -# - The next Python version has been officially released. 
-# OR -# - Most/All of our optional dependencies support Python 3.11 AND -# - The next Python version has released a rc(we are guaranteed a stable ABI). -# To freeze this file, uncomment out the ``if: false`` condition, and migrate the jobs -# to the corresponding posix/windows-macos/sdist etc. workflows. -# Feel free to modify this comment as necessary. - -name: Python Dev - -on: - push: - branches: - - main - - 2.0.x - - None - pull_request: - branches: - - main - - 2.0.x - - None - paths-ignore: - - "doc/**" - -env: - PYTEST_WORKERS: "auto" - PANDAS_CI: 1 - PATTERN: "not slow and not network and not clipboard and not single_cpu" - COVERAGE: true - PYTEST_TARGET: pandas - -permissions: - contents: read - -jobs: - build: - if: false # Uncomment this to freeze the workflow, comment it to unfreeze - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-22.04, macOS-latest, windows-latest] - - name: actions-311-dev - timeout-minutes: 120 - - concurrency: - #https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-${{ matrix.pytest_target }}-dev - cancel-in-progress: true - - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Set up Python Dev Version - uses: actions/setup-python@v4 - with: - python-version: '3.11-dev' - - - name: Install dependencies - run: | - python --version - python -m pip install --upgrade pip setuptools wheel - python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple numpy - python -m pip install git+https://github.com/nedbat/coveragepy.git - python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz cython hypothesis>=6.34.2 pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 - python -m pip list - - # GH 47305: Parallel build can cause flaky ImportError from pandas/_libs/tslibs - - name: Build Pandas - run: | - python setup.py build_ext -q -j1 - python -m pip install -e . --no-build-isolation --no-use-pep517 --no-index - - - name: Build Version - run: | - python -c "import pandas; pandas.show_versions();" - - - name: Test - uses: ./.github/actions/run-tests diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml deleted file mode 100644 index 7bffd066b7e64..0000000000000 --- a/.github/workflows/ubuntu.yml +++ /dev/null @@ -1,167 +0,0 @@ -name: Ubuntu - -on: - push: - branches: - - main - - 2.0.x - pull_request: - branches: - - main - - 2.0.x - paths-ignore: - - "doc/**" - -permissions: - contents: read - -jobs: - pytest: - runs-on: ubuntu-22.04 - defaults: - run: - shell: bash -el {0} - timeout-minutes: 180 - strategy: - matrix: - env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml] - # Prevent the include jobs from overriding other jobs - pattern: [""] - include: - - name: "Downstream Compat" - env_file: actions-38-downstream_compat.yaml - pattern: "not slow and not network and not single_cpu" - pytest_target: "pandas/tests/test_downstream.py" - - name: "Minimum Versions" - env_file: actions-38-minimum_versions.yaml - pattern: "not slow and not network and not single_cpu" - - name: "Locale: it_IT" - env_file: actions-38.yaml - pattern: "not slow and not network and not single_cpu" - extra_apt: "language-pack-it" - # Use the utf8 version as the default, it has no bad side-effect. 
- lang: "it_IT.utf8" - lc_all: "it_IT.utf8" - # Also install it_IT (its encoding is ISO8859-1) but do not activate it. - # It will be temporarily activated during tests with locale.setlocale - extra_loc: "it_IT" - - name: "Locale: zh_CN" - env_file: actions-38.yaml - pattern: "not slow and not network and not single_cpu" - extra_apt: "language-pack-zh-hans" - # Use the utf8 version as the default, it has no bad side-effect. - lang: "zh_CN.utf8" - lc_all: "zh_CN.utf8" - # Also install zh_CN (its encoding is gb2312) but do not activate it. - # It will be temporarily activated during tests with locale.setlocale - extra_loc: "zh_CN" - - name: "Copy-on-Write" - env_file: actions-310.yaml - pattern: "not slow and not network and not single_cpu" - pandas_copy_on_write: "1" - - name: "Pypy" - env_file: actions-pypy-38.yaml - pattern: "not slow and not network and not single_cpu" - test_args: "--max-worker-restart 0" - - name: "Numpy Dev" - env_file: actions-310-numpydev.yaml - pattern: "not slow and not network and not single_cpu" - test_args: "-W error::DeprecationWarning -W error::FutureWarning" - # TODO(cython3): Re-enable once next-beta(after beta 1) comes out - # There are some warnings failing the build with -werror - pandas_ci: "0" - - name: "Pyarrow Nightly" - env_file: actions-311-pyarrownightly.yaml - pattern: "not slow and not network and not single_cpu" - fail-fast: false - name: ${{ matrix.name || matrix.env_file }} - env: - ENV_FILE: ci/deps/${{ matrix.env_file }} - PATTERN: ${{ matrix.pattern }} - EXTRA_APT: ${{ matrix.extra_apt || '' }} - LANG: ${{ matrix.lang || '' }} - LC_ALL: ${{ matrix.lc_all || '' }} - PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }} - PANDAS_CI: ${{ matrix.pandas_ci || '1' }} - TEST_ARGS: ${{ matrix.test_args || '' }} - PYTEST_WORKERS: 'auto' - PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} - concurrency: - # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }} - cancel-in-progress: true - - services: - mysql: - image: mysql - env: - MYSQL_ALLOW_EMPTY_PASSWORD: yes - MYSQL_DATABASE: pandas - options: >- - --health-cmd "mysqladmin ping" - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - - 3306:3306 - - postgres: - image: postgres - env: - POSTGRES_USER: postgres - POSTGRES_PASSWORD: postgres - POSTGRES_DB: pandas - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - - 5432:5432 - - moto: - image: motoserver/moto:4.1.4 - env: - AWS_ACCESS_KEY_ID: foobar_key - AWS_SECRET_ACCESS_KEY: foobar_secret - ports: - - 5000:5000 - - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Extra installs - # xsel for clipboard tests - run: sudo apt-get update && sudo apt-get install -y xsel ${{ env.EXTRA_APT }} - - - name: Generate extra locales - # These extra locales will be available for locale.setlocale() calls in tests - run: | - sudo locale-gen ${{ matrix.extra_loc }} - if: ${{ matrix.extra_loc }} - - - name: Set up Conda - uses: ./.github/actions/setup-conda - with: - environment-file: ${{ env.ENV_FILE }} - - - name: Build Pandas - id: build - uses: ./.github/actions/build_pandas - - - name: Test (not single_cpu) - uses: ./.github/actions/run-tests - if: ${{ matrix.name != 'Pypy' }} - env: - # Set pattern to not single_cpu if not already set - PATTERN: ${{ 
env.PATTERN == '' && 'not single_cpu' || matrix.pattern }} - - - name: Test (single_cpu) - uses: ./.github/actions/run-tests - env: - PATTERN: 'single_cpu' - PYTEST_WORKERS: 1 - if: ${{ matrix.pattern == '' && (always() && steps.build.outcome == 'success')}} diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml new file mode 100644 index 0000000000000..31e2095624347 --- /dev/null +++ b/.github/workflows/unit-tests.yml @@ -0,0 +1,315 @@ +name: Unit Tests + +on: + push: + branches: + - main + - 2.0.x + pull_request: + branches: + - main + - 2.0.x + paths-ignore: + - "doc/**" + - "web/**" + +permissions: + contents: read + +defaults: + run: + shell: bash -el {0} + +jobs: + ubuntu: + runs-on: ubuntu-22.04 + timeout-minutes: 180 + strategy: + matrix: + env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml] + # Prevent the include jobs from overriding other jobs + pattern: [""] + include: + - name: "Downstream Compat" + env_file: actions-38-downstream_compat.yaml + pattern: "not slow and not network and not single_cpu" + pytest_target: "pandas/tests/test_downstream.py" + - name: "Minimum Versions" + env_file: actions-38-minimum_versions.yaml + pattern: "not slow and not network and not single_cpu" + - name: "Locale: it_IT" + env_file: actions-38.yaml + pattern: "not slow and not network and not single_cpu" + extra_apt: "language-pack-it" + # Use the utf8 version as the default, it has no bad side-effect. + lang: "it_IT.utf8" + lc_all: "it_IT.utf8" + # Also install it_IT (its encoding is ISO8859-1) but do not activate it. + # It will be temporarily activated during tests with locale.setlocale + extra_loc: "it_IT" + - name: "Locale: zh_CN" + env_file: actions-38.yaml + pattern: "not slow and not network and not single_cpu" + extra_apt: "language-pack-zh-hans" + # Use the utf8 version as the default, it has no bad side-effect. + lang: "zh_CN.utf8" + lc_all: "zh_CN.utf8" + # Also install zh_CN (its encoding is gb2312) but do not activate it. 
+ # It will be temporarily activated during tests with locale.setlocale + extra_loc: "zh_CN" + - name: "Copy-on-Write" + env_file: actions-310.yaml + pattern: "not slow and not network and not single_cpu" + pandas_copy_on_write: "1" + - name: "Pypy" + env_file: actions-pypy-38.yaml + pattern: "not slow and not network and not single_cpu" + test_args: "--max-worker-restart 0" + - name: "Numpy Dev" + env_file: actions-310-numpydev.yaml + pattern: "not slow and not network and not single_cpu" + test_args: "-W error::DeprecationWarning -W error::FutureWarning" + # TODO(cython3): Re-enable once next-beta(after beta 1) comes out + # There are some warnings failing the build with -werror + pandas_ci: "0" + - name: "Pyarrow Nightly" + env_file: actions-311-pyarrownightly.yaml + pattern: "not slow and not network and not single_cpu" + fail-fast: false + name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }} + env: + ENV_FILE: ci/deps/${{ matrix.env_file }} + PATTERN: ${{ matrix.pattern }} + EXTRA_APT: ${{ matrix.extra_apt || '' }} + LANG: ${{ matrix.lang || '' }} + LC_ALL: ${{ matrix.lc_all || '' }} + PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }} + PANDAS_CI: ${{ matrix.pandas_ci || '1' }} + TEST_ARGS: ${{ matrix.test_args || '' }} + PYTEST_WORKERS: 'auto' + PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }} + cancel-in-progress: true + + services: + mysql: + image: mysql + env: + MYSQL_ALLOW_EMPTY_PASSWORD: yes + MYSQL_DATABASE: pandas + options: >- + --health-cmd "mysqladmin ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 3306:3306 + + postgres: + image: postgres + env: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: pandas + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 5432:5432 + + moto: + image: motoserver/moto:4.1.4 + env: + AWS_ACCESS_KEY_ID: foobar_key + AWS_SECRET_ACCESS_KEY: foobar_secret + ports: + - 5000:5000 + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Extra installs + # xsel for clipboard tests + run: sudo apt-get update && sudo apt-get install -y xsel ${{ env.EXTRA_APT }} + + - name: Generate extra locales + # These extra locales will be available for locale.setlocale() calls in tests + run: | + sudo locale-gen ${{ matrix.extra_loc }} + if: ${{ matrix.extra_loc }} + + - name: Set up Conda + uses: ./.github/actions/setup-conda + with: + environment-file: ${{ env.ENV_FILE }} + + - name: Build Pandas + id: build + uses: ./.github/actions/build_pandas + + - name: Test (not single_cpu) + uses: ./.github/actions/run-tests + if: ${{ matrix.name != 'Pypy' }} + env: + # Set pattern to not single_cpu if not already set + PATTERN: ${{ env.PATTERN == '' && 'not single_cpu' || matrix.pattern }} + + - name: Test (single_cpu) + uses: ./.github/actions/run-tests + env: + PATTERN: 'single_cpu' + PYTEST_WORKERS: 1 + if: ${{ matrix.pattern == '' && (always() && steps.build.outcome == 'success')}} + + macos-windows: + timeout-minutes: 180 + strategy: + matrix: + os: [macos-latest, windows-latest] + env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml] + fail-fast: false + runs-on: ${{ matrix.os }} + name: ${{ format('{0} 
{1}', matrix.os, matrix.env_file) }} + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.os }} + cancel-in-progress: true + env: + PANDAS_CI: 1 + PYTEST_TARGET: pandas + PATTERN: "not slow and not db and not network and not single_cpu" + # GH 47443: PYTEST_WORKERS > 1 crashes Windows builds with memory related errors + PYTEST_WORKERS: ${{ matrix.os == 'macos-latest' && 'auto' || '1' }} + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Conda + uses: ./.github/actions/setup-conda + with: + environment-file: ci/deps/${{ matrix.env_file }} + + - name: Build Pandas + uses: ./.github/actions/build_pandas + + - name: Test + uses: ./.github/actions/run-tests + + Linux-32-bit: + runs-on: ubuntu-22.04 + container: + image: quay.io/pypa/manylinux2014_i686 + options: --platform linux/386 + steps: + - name: Checkout pandas Repo + # actions/checkout does not work since it requires node + run: | + git config --global --add safe.directory $PWD + + if [ $GITHUB_EVENT_NAME != pull_request ]; then + git clone --recursive --branch=$GITHUB_REF_NAME https://github.com/${GITHUB_REPOSITORY}.git $GITHUB_WORKSPACE + git reset --hard $GITHUB_SHA + else + git clone --recursive https://github.com/${GITHUB_REPOSITORY}.git $GITHUB_WORKSPACE + git fetch origin $GITHUB_REF:my_ref_name + git checkout $GITHUB_BASE_REF + git -c user.email="you@example.com" merge --no-commit my_ref_name + fi + - name: Build environment and Run Tests + run: | + /opt/python/cp38-cp38/bin/python -m venv ~/virtualenvs/pandas-dev + . ~/virtualenvs/pandas-dev/bin/activate + python -m pip install --no-cache-dir --no-deps -U pip wheel setuptools + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python setup.py build_ext -q -j$(nproc) + python -m pip install --no-cache-dir --no-build-isolation --no-use-pep517 -e . + python -m pip list + export PANDAS_CI=1 + python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-32bit + cancel-in-progress: true + + python-dev: + # This job may or may not run depending on the state of the next + # unreleased Python version. DO NOT DELETE IT. + # + # In general, this will remain frozen(present, but not running) until: + # - The next unreleased Python version has released beta 1 + # - This version should be available on GitHub Actions. + # - Our required build/runtime dependencies(numpy, pytz, Cython, python-dateutil) + # support that unreleased Python version. + # To unfreeze, comment out the ``if: false`` condition, and make sure you update + # the name of the workflow and Python version in actions/setup-python ``python-version:`` + # + # After it has been unfrozen, this file should remain unfrozen(present, and running) until: + # - The next Python version has been officially released. + # OR + # - Most/All of our optional dependencies support the next Python version AND + # - The next Python version has released a rc(we are guaranteed a stable ABI). + # To freeze this file, uncomment out the ``if: false`` condition, and migrate the jobs + # to the corresponding posix/windows-macos/sdist etc. 
workflows. + # Feel free to modify this comment as necessary. + if: false # Uncomment this to freeze the workflow, comment it to unfreeze + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-22.04, macOS-latest, windows-latest] + + timeout-minutes: 180 + + concurrency: + #https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-${{ matrix.pytest_target }}-dev + cancel-in-progress: true + + env: + PYTEST_WORKERS: "auto" + PANDAS_CI: 1 + PATTERN: "not slow and not network and not clipboard and not single_cpu" + COVERAGE: true + PYTEST_TARGET: pandas + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Python Dev Version + uses: actions/setup-python@v4 + with: + python-version: '3.11-dev' + + - name: Install dependencies + run: | + python --version + python -m pip install --upgrade pip setuptools wheel + python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple numpy + python -m pip install git+https://github.com/nedbat/coveragepy.git + python -m pip install versioneer[toml] + python -m pip install python-dateutil pytz cython hypothesis>=6.46.1 pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 + python -m pip list + + - name: Build Pandas + run: | + python setup.py build_ext -q -j4 + python -m pip install -e . --no-build-isolation --no-use-pep517 --no-index + + - name: Build Version + run: | + python -c "import pandas; pandas.show_versions();" + + - name: Test + uses: ./.github/actions/run-tests diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 499bcae5e90f0..1d969e648b752 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -510,6 +510,8 @@ def test_ignore_downcast_neg_to_unsigned(): tm.assert_numpy_array_equal(res, expected) +# Warning in 32 bit platforms +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") @pytest.mark.parametrize("downcast", ["integer", "signed", "unsigned"]) @pytest.mark.parametrize( "data,expected", From 25643d66da1f6d68b4ef09afbe8f7f766faeb6b3 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 25 Apr 2023 11:15:43 +0200 Subject: [PATCH 04/41] Backport PR #52824 on branch 2.0.x (BUG: interchange bitmasks not supported in interchange/from_dataframe.py) (#52907) Backport PR #52824: BUG: interchange bitmasks not supported in interchange/from_dataframe.py Co-authored-by: Marco Edward Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- doc/source/whatsnew/v2.0.2.rst | 2 + pandas/core/interchange/from_dataframe.py | 104 +++++++--------------- pandas/tests/interchange/test_impl.py | 26 ++++++ 3 files changed, 62 insertions(+), 70 deletions(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 0a6738cb9b3dc..09932a2d2d571 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -20,6 +20,8 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`) +- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 998f3bc374942..45d6bdd7917c1 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -6,6 +6,8 @@ import numpy as np +from pandas.compat._optional import import_optional_dependency + import pandas as pd from pandas.core.interchange.dataframe_protocol import ( Buffer, @@ -23,7 +25,7 @@ DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64}, DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64}, DtypeKind.FLOAT: {32: np.float32, 64: np.float64}, - DtypeKind.BOOL: {8: bool}, + DtypeKind.BOOL: {1: bool, 8: bool}, } @@ -154,7 +156,9 @@ def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: buffers = col.get_buffers() data_buff, data_dtype = buffers["data"] - data = buffer_to_ndarray(data_buff, data_dtype, col.offset, col.size()) + data = buffer_to_ndarray( + data_buff, data_dtype, offset=col.offset, length=col.size() + ) data = set_nulls(data, col, buffers["validity"]) return data, buffers @@ -192,7 +196,9 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]: buffers = col.get_buffers() codes_buff, codes_dtype = buffers["data"] - codes = buffer_to_ndarray(codes_buff, codes_dtype, col.offset, col.size()) + codes = buffer_to_ndarray( + codes_buff, codes_dtype, offset=col.offset, length=col.size() + ) # Doing module in order to not get ``IndexError`` for # out-of-bounds sentinel values in `codes` @@ -252,7 +258,7 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: Endianness.NATIVE, ) # Specify zero offset as we don't want to chunk the string data - data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=col.size()) + data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize) # Retrieve the offsets buffer containing the index offsets demarcating # the beginning and the ending of each string @@ -261,14 +267,16 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: # meaning that it has more elements than in the data buffer, do `col.size() + 1` # here to pass a proper offsets buffer size offsets = buffer_to_ndarray( - offset_buff, offset_dtype, col.offset, length=col.size() + 1 + offset_buff, offset_dtype, offset=col.offset, length=col.size() + 1 ) null_pos = None if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): assert buffers["validity"], "Validity buffers cannot be empty for masks" valid_buff, valid_dtype = buffers["validity"] - null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size()) + null_pos = buffer_to_ndarray( + valid_buff, valid_dtype, offset=col.offset, length=col.size() + ) if sentinel_val == 0: null_pos = ~null_pos @@ -356,8 +364,8 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: getattr(ArrowCTypes, f"UINT{dtype[1]}"), Endianness.NATIVE, ), - col.offset, - col.size(), + offset=col.offset, + length=col.size(), ) data = parse_datetime_format_str(format_str, data) @@ -368,8 +376,9 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: def buffer_to_ndarray( buffer: Buffer, dtype: tuple[DtypeKind, int, str, str], + *, + length: int, offset: int = 0, - length: int | None = None, ) -> np.ndarray: """ Build a NumPy array from the passed buffer. 
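The next hunk swaps the hand-rolled bit-unpacking loop (``bitmask_to_bool_ndarray``, deleted below) for pyarrow's own bitmap decoding. A minimal self-contained sketch of that approach, assuming numpy and pyarrow are available (the three-element mask is made-up sample data, and ``pa.py_buffer`` stands in for the ``pa.foreign_buffer`` call the patch makes on the interchange buffer's raw pointer):

    import numpy as np
    import pyarrow as pa

    # Pack three validity bits (valid, null, valid) LSB-first, the layout
    # Arrow uses for bitmask buffers.
    bits = np.packbits(np.array([1, 0, 1], dtype=np.uint8), bitorder="little")
    buf = pa.py_buffer(bits.tobytes())

    # Wrap the raw bitmask in a length-3 BooleanArray and let pyarrow decode
    # the bits; np.asarray then materializes a plain boolean ndarray.
    arr = pa.BooleanArray.from_buffers(pa.bool_(), 3, [None, buf], offset=0)
    print(np.asarray(arr))  # [ True False  True]

Delegating to pyarrow also handles non-zero bit offsets from sliced buffers via the ``offset`` argument, which is the bookkeeping the deleted ``first_byte_offset`` logic used to do by hand.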
@@ -406,74 +415,27 @@ def buffer_to_ndarray( # and size in the buffer plus the dtype on the column. Use DLPack as NumPy supports # it since https://github.com/numpy/numpy/pull/19083 ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype) - data_pointer = ctypes.cast( - buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type) - ) if bit_width == 1: assert length is not None, "`length` must be specified for a bit-mask buffer." - arr = np.ctypeslib.as_array(data_pointer, shape=(buffer.bufsize,)) - return bitmask_to_bool_ndarray(arr, length, first_byte_offset=offset % 8) + pa = import_optional_dependency("pyarrow") + arr = pa.BooleanArray.from_buffers( + pa.bool_(), + length, + [None, pa.foreign_buffer(buffer.ptr, length)], + offset=offset, + ) + return np.asarray(arr) else: + data_pointer = ctypes.cast( + buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type) + ) return np.ctypeslib.as_array( - data_pointer, shape=(buffer.bufsize // (bit_width // 8),) + data_pointer, + shape=(length,), ) -def bitmask_to_bool_ndarray( - bitmask: np.ndarray, mask_length: int, first_byte_offset: int = 0 -) -> np.ndarray: - """ - Convert bit-mask to a boolean NumPy array. - - Parameters - ---------- - bitmask : np.ndarray[uint8] - NumPy array of uint8 dtype representing the bitmask. - mask_length : int - Number of elements in the mask to interpret. - first_byte_offset : int, default: 0 - Number of elements to offset from the start of the first byte. - - Returns - ------- - np.ndarray[bool] - """ - bytes_to_skip = first_byte_offset // 8 - bitmask = bitmask[bytes_to_skip:] - first_byte_offset %= 8 - - bool_mask = np.zeros(mask_length, dtype=bool) - - # Processing the first byte separately as it has its own offset - val = bitmask[0] - mask_idx = 0 - bits_in_first_byte = min(8 - first_byte_offset, mask_length) - for j in range(bits_in_first_byte): - if val & (1 << (j + first_byte_offset)): - bool_mask[mask_idx] = True - mask_idx += 1 - - # `mask_length // 8` describes how many full bytes to process - for i in range((mask_length - bits_in_first_byte) // 8): - # doing `+ 1` as we already processed the first byte - val = bitmask[i + 1] - for j in range(8): - if val & (1 << j): - bool_mask[mask_idx] = True - mask_idx += 1 - - if len(bitmask) > 1: - # Processing reminder of last byte - val = bitmask[-1] - for j in range(len(bool_mask) - mask_idx): - if val & (1 << j): - bool_mask[mask_idx] = True - mask_idx += 1 - - return bool_mask - - def set_nulls( data: np.ndarray | pd.Series, col: Column, @@ -509,7 +471,9 @@ def set_nulls( elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): assert validity, "Expected to have a validity buffer for the mask" valid_buff, valid_dtype = validity - null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size()) + null_pos = buffer_to_ndarray( + valid_buff, valid_dtype, offset=col.offset, length=col.size() + ) if sentinel_val == 0: null_pos = ~null_pos elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN): diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index a9835b8641e7d..d393ba6fd3957 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -104,6 +104,32 @@ def test_large_string_pyarrow(): assert pa.Table.equals(pa.interchange.from_dataframe(result), table) +@pytest.mark.parametrize( + ("offset", "length", "expected_values"), + [ + (0, None, [3.3, float("nan"), 2.1]), + (1, None, [float("nan"), 2.1]), + (2, None, [2.1]), + 
(0, 2, [3.3, float("nan")]), + (0, 1, [3.3]), + (1, 1, [float("nan")]), + ], +) +def test_bitmasks_pyarrow(offset, length, expected_values): + # GH 52795 + pa = pytest.importorskip("pyarrow", "11.0.0") + + arr = [3.3, None, 2.1] + table = pa.table({"arr": arr}).slice(offset, length) + exchange_df = table.__dataframe__() + result = from_dataframe(exchange_df) + expected = pd.DataFrame({"arr": expected_values}) + tm.assert_frame_equal(result, expected) + + # check round-trip + assert pa.Table.equals(pa.interchange.from_dataframe(result), table) + + @pytest.mark.parametrize( "data", [int_data, uint_data, float_data, bool_data, datetime_data] ) From 6bc51af0e340ef4a345f60002dc74aa720e107c1 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 26 Apr 2023 00:22:48 +0200 Subject: [PATCH 05/41] Backport PR #52916 on branch 2.0.x (CI: Bump GHA versions) (#52918) Backport PR #52916: CI: Bump GHA versions Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .github/actions/run-tests/action.yml | 4 ++-- .github/actions/setup-conda/action.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml index 2a7601f196ec4..fd7c3587f2254 100644 --- a/.github/actions/run-tests/action.yml +++ b/.github/actions/run-tests/action.yml @@ -7,7 +7,7 @@ runs: shell: bash -el {0} - name: Publish test results - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: Test results path: test-data.xml @@ -19,7 +19,7 @@ runs: if: failure() - name: Upload coverage to Codecov - uses: codecov/codecov-action@v2 + uses: codecov/codecov-action@v3 with: flags: unittests name: codecov-pandas diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml index b667075e87144..8aa417c1d8fd4 100644 --- a/.github/actions/setup-conda/action.yml +++ b/.github/actions/setup-conda/action.yml @@ -13,7 +13,7 @@ runs: using: composite steps: - name: Install ${{ inputs.environment-file }} - uses: mamba-org/provision-with-micromamba@v12 + uses: mamba-org/provision-with-micromamba@v15 with: environment-file: ${{ inputs.environment-file }} environment-name: ${{ inputs.environment-name }} From a1c6fe0620f4322dc9b6c2265e3602c373f2f06c Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 26 Apr 2023 20:28:43 +0200 Subject: [PATCH 06/41] Backport PR #52906 on branch 2.0.x (DEPS: Address numpy deprecation of len 1 arrays assignment) (#52933) Backport PR #52906: DEPS: Address numpy deprecation of len 1 arrays assignment Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/indexing.py | 7 +++++++ pandas/core/internals/base.py | 4 ++++ pandas/core/internals/blocks.py | 4 +++- pandas/io/parsers/base_parser.py | 10 +++++++++- pandas/tests/groupby/test_groupby_dropna.py | 2 +- 5 files changed, 24 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index afb5ab036b5b5..f8d78d21f74df 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1768,6 +1768,13 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc"): if not isinstance(value, ABCSeries): # if not Series (in which case we need to align), # we can short-circuit + if ( + isinstance(arr, np.ndarray) + and arr.ndim == 1 + and len(arr) == 1 + ): + # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 
+ arr = arr[0, ...] empty_value[indexer[0]] = arr self.obj[key] = empty_value return diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index bb5d7e839a98c..bf48e1ff0a653 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -186,6 +186,10 @@ def setitem_inplace(self, indexer, value) -> None: # dt64/td64, which do their own validation. value = np_can_hold_element(arr.dtype, value) + if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1: + # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 + value = value[0, ...] + arr[indexer] = value def grouped_reduce(self, func): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index cb336d2f718a6..b2a6b1fa39219 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1061,7 +1061,9 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block: self = self.make_block_same_class( values.T if values.ndim == 2 else values ) - + if isinstance(casted, np.ndarray) and casted.ndim == 1 and len(casted) == 1: + # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 + casted = casted[0, ...] values[indexer] = casted return self diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 3acb1073bac93..f354fe9a53b48 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -1113,6 +1113,12 @@ def _make_date_converter( if date_parser is not lib.no_default and date_format is not None: raise TypeError("Cannot use both 'date_parser' and 'date_format'") + def unpack_if_single_element(arg): + # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615 + if isinstance(arg, np.ndarray) and arg.ndim == 1 and len(arg) == 1: + return arg[0] + return arg + def converter(*date_cols, col: Hashable): if date_parser is lib.no_default: strs = parsing.concat_date_cols(date_cols) @@ -1136,7 +1142,9 @@ def converter(*date_cols, col: Hashable): else: try: result = tools.to_datetime( - date_parser(*date_cols), errors="ignore", cache=cache_dates + date_parser(*(unpack_if_single_element(arg) for arg in date_cols)), + errors="ignore", + cache=cache_dates, ) if isinstance(result, datetime.datetime): raise Exception("scalar parser") diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 31a8e7a7d36ac..1ef5bc9925f95 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -620,7 +620,7 @@ def test_categorical_transformers( result = getattr(gb_keepna, transformation_func)(*args) expected = getattr(gb_dropna, transformation_func)(*args) for iloc, value in zip( - df[df["x"].isnull()].index.tolist(), null_group_result.values + df[df["x"].isnull()].index.tolist(), null_group_result.values.ravel() ): if expected.ndim == 1: expected.iloc[iloc] = value From cab4cf4511617e3421d3065e3ad16cde601c0156 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 26 Apr 2023 20:28:55 +0200 Subject: [PATCH 07/41] Backport PR #52929 on branch 2.0.x (TST/BUG: pyarrow test fixtures upcasting dtypes) (#52934) Backport PR #52929: TST/BUG: pyarrow test fixtures upcasting dtypes Co-authored-by: Luke Manley --- pandas/tests/extension/test_arrow.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index d557f5efc0a8a..bc642faeed163 
100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -131,7 +131,7 @@ def data(dtype): @pytest.fixture def data_missing(data): """Length-2 array with [NA, Valid]""" - return type(data)._from_sequence([None, data[0]]) + return type(data)._from_sequence([None, data[0]], dtype=data.dtype) @pytest.fixture(params=["data", "data_missing"]) @@ -214,7 +214,8 @@ def data_for_sorting(data_for_grouping): A < B < C """ return type(data_for_grouping)._from_sequence( - [data_for_grouping[0], data_for_grouping[7], data_for_grouping[4]] + [data_for_grouping[0], data_for_grouping[7], data_for_grouping[4]], + dtype=data_for_grouping.dtype, ) @@ -227,7 +228,8 @@ def data_missing_for_sorting(data_for_grouping): A < B and NA missing. """ return type(data_for_grouping)._from_sequence( - [data_for_grouping[0], data_for_grouping[2], data_for_grouping[4]] + [data_for_grouping[0], data_for_grouping[2], data_for_grouping[4]], + dtype=data_for_grouping.dtype, ) From 91757c5e5c39e914b58cbf32f710a45b6a4a418f Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 27 Apr 2023 04:15:28 +0200 Subject: [PATCH 08/41] Backport PR #52591 on branch 2.0.x (BUG: pd.array raising with NumPy array and large dtype) (#52951) Backport PR #52591: BUG: pd.array raising with NumPy array and large dtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.0.2.rst | 1 + pandas/core/arrays/arrow/array.py | 10 ++++++++++ pandas/tests/extension/test_arrow.py | 14 ++++++++++++++ 3 files changed, 25 insertions(+) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 09932a2d2d571..f6b0b4086cb39 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -22,6 +22,7 @@ Bug fixes ~~~~~~~~~ - Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) +- Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index dd8484050ef89..0ebbecc6128a5 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -247,6 +247,16 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal Construct a new ExtensionArray from a sequence of scalars. 
""" pa_dtype = to_pyarrow_type(dtype) + if ( + isinstance(scalars, np.ndarray) + and isinstance(dtype, ArrowDtype) + and ( + pa.types.is_large_binary(pa_dtype) or pa.types.is_large_string(pa_dtype) + ) + ): + # See https://github.com/apache/arrow/issues/35289 + scalars = scalars.tolist() + if isinstance(scalars, cls): scalars = scalars._data elif not isinstance(scalars, (pa.Array, pa.ChunkedArray)): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index bc642faeed163..7fe97cec8e3cb 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2620,6 +2620,20 @@ def test_setitem_boolean_replace_with_mask_segfault(): assert arr._data == expected._data +@pytest.mark.parametrize( + "data, arrow_dtype", + [ + ([b"a", b"b"], pa.large_binary()), + (["a", "b"], pa.large_string()), + ], +) +def test_conversion_large_dtypes_from_numpy_array(data, arrow_dtype): + dtype = ArrowDtype(arrow_dtype) + result = pd.array(np.array(data), dtype=dtype) + expected = pd.array(data, dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + @pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES + tm.FLOAT_PYARROW_DTYPES) def test_describe_numeric_data(pa_type): # GH 52470 From 79abb4435d2c59315a63d7387fa3c0dfba64da1b Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 27 Apr 2023 16:33:03 +0200 Subject: [PATCH 09/41] Backport PR #52872 on branch 2.0.x (BUG: convert_dtypes ingoring convert keywords for pyarrow backend) (#52959) Backport PR #52872: BUG: convert_dtypes ingoring convert keywords for pyarrow backend Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.0.2.rst | 1 + pandas/core/dtypes/cast.py | 37 ++++++++++++------- .../frame/methods/test_convert_dtypes.py | 14 +++++++ 3 files changed, 38 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index f6b0b4086cb39..adfebd857b390 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -22,6 +22,7 @@ Bug fixes ~~~~~~~~~ - Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) +- Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_*`` keywords when set to False ``dtype_backend="pyarrow"`` (:issue:`52872`) - Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`) - diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 1e2192f4c7691..05e108099fd5a 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1122,20 +1122,29 @@ def convert_dtypes( from pandas.core.arrays.arrow.dtype import ArrowDtype from pandas.core.arrays.string_ import StringDtype - if isinstance(inferred_dtype, PandasExtensionDtype): - base_dtype = inferred_dtype.base - elif isinstance(inferred_dtype, (BaseMaskedDtype, ArrowDtype)): - base_dtype = inferred_dtype.numpy_dtype - elif isinstance(inferred_dtype, StringDtype): - base_dtype = np.dtype(str) - else: - # error: Incompatible types in assignment (expression has type - # "Union[str, Any, dtype[Any], ExtensionDtype]", - # variable has type "Union[dtype[Any], ExtensionDtype, None]") - base_dtype = inferred_dtype # type: ignore[assignment] - pa_type = to_pyarrow_type(base_dtype) 
- if pa_type is not None: - inferred_dtype = ArrowDtype(pa_type) + assert not isinstance(inferred_dtype, str) + + if ( + (convert_integer and inferred_dtype.kind in "iu") + or (convert_floating and inferred_dtype.kind in "fc") + or (convert_boolean and inferred_dtype.kind == "b") + or (convert_string and isinstance(inferred_dtype, StringDtype)) + or ( + inferred_dtype.kind not in "iufcb" + and not isinstance(inferred_dtype, StringDtype) + ) + ): + if isinstance(inferred_dtype, PandasExtensionDtype): + base_dtype = inferred_dtype.base + elif isinstance(inferred_dtype, (BaseMaskedDtype, ArrowDtype)): + base_dtype = inferred_dtype.numpy_dtype + elif isinstance(inferred_dtype, StringDtype): + base_dtype = np.dtype(str) + else: + base_dtype = inferred_dtype + pa_type = to_pyarrow_type(base_dtype) + if pa_type is not None: + inferred_dtype = ArrowDtype(pa_type) # error: Incompatible return value type (got "Union[str, Union[dtype[Any], # ExtensionDtype]]", expected "Union[dtype[Any], ExtensionDtype]") diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 6076933eecec4..a749cd11df4f7 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -134,3 +134,17 @@ def test_pyarrow_engine_lines_false(self): ) with pytest.raises(ValueError, match=msg): df.convert_dtypes(dtype_backend="numpy") + + def test_pyarrow_backend_no_convesion(self): + # GH#52872 + pytest.importorskip("pyarrow") + df = pd.DataFrame({"a": [1, 2], "b": 1.5, "c": True, "d": "x"}) + expected = df.copy() + result = df.convert_dtypes( + convert_floating=False, + convert_integer=False, + convert_boolean=False, + convert_string=False, + dtype_backend="pyarrow", + ) + tm.assert_frame_equal(result, expected) From b56c6ce5f5e1099c4af9681ac66cf70738351719 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 27 Apr 2023 18:13:16 +0200 Subject: [PATCH 10/41] Backport PR #52949 on branch 2.0.x (DOC: Clean up for deprecations) (#52962) Backport PR #52949: DOC: Clean up for deprecations Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/indexes/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e8969e90e6318..890ac084680a1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2521,7 +2521,7 @@ def is_categorical(self) -> bool: Check if the Index holds categorical data. .. deprecated:: 2.0.0 - Use :meth:`pandas.api.types.is_categorical_dtype` instead. + Use `isinstance(index.dtype, pd.CategoricalDtype)` instead. Returns ------- @@ -2574,7 +2574,7 @@ def is_interval(self) -> bool: Check if the Index holds Interval objects. .. deprecated:: 2.0.0 - Use `pandas.api.types.is_interval_dtype` instead. + Use `isinstance(index.dtype, pd.IntervalDtype)` instead. 
Returns ------- From 8475f2fccf28ced9dd0b8bf3723301a26fb39fb6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 28 Apr 2023 12:39:05 -0700 Subject: [PATCH 11/41] Backport PR #52794: CI: Remove redundant sdist workflow (#52984) --- .github/workflows/sdist.yml | 95 ------------------------------------ .github/workflows/wheels.yml | 5 +- 2 files changed, 4 insertions(+), 96 deletions(-) delete mode 100644 .github/workflows/sdist.yml diff --git a/.github/workflows/sdist.yml b/.github/workflows/sdist.yml deleted file mode 100644 index 284935d7051e1..0000000000000 --- a/.github/workflows/sdist.yml +++ /dev/null @@ -1,95 +0,0 @@ -name: sdist - -on: - push: - branches: - - main - - 2.0.x - pull_request: - branches: - - main - - 2.0.x - types: [labeled, opened, synchronize, reopened] - paths-ignore: - - "doc/**" - -permissions: - contents: read - -jobs: - build: - if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}} - runs-on: ubuntu-22.04 - timeout-minutes: 60 - defaults: - run: - shell: bash -el {0} - - strategy: - fail-fast: false - matrix: - python-version: ["3.8", "3.9", "3.10", "3.11"] - concurrency: - # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{matrix.python-version}}-sdist - cancel-in-progress: true - - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies - run: | - python -m pip install --upgrade pip setuptools wheel - python -m pip install versioneer[toml] - - # GH 39416 - pip install numpy - - - name: Build pandas sdist - run: | - pip list - python setup.py sdist --formats=gztar - - - name: Upload sdist artifact - uses: actions/upload-artifact@v3 - with: - name: ${{matrix.python-version}}-sdist.gz - path: dist/*.gz - - - name: Set up Conda - uses: ./.github/actions/setup-conda - with: - environment-file: false - environment-name: pandas-sdist - extra-specs: | - python =${{ matrix.python-version }} - - - name: Install pandas from sdist - run: | - pip list - python -m pip install dist/*.gz - - - name: Force oldest supported NumPy - run: | - case "${{matrix.python-version}}" in - 3.8) - pip install numpy==1.20.3 ;; - 3.9) - pip install numpy==1.20.3 ;; - 3.10) - pip install numpy==1.21.2 ;; - 3.11) - pip install numpy==1.23.2 ;; - esac - - - name: Import pandas - run: | - cd .. 
- python -c "import pandas; pandas.show_versions();" diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 07f3224679e90..c9e44fae43669 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -23,7 +23,10 @@ on: - cron: "27 3 */1 * *" push: pull_request: - types: [labeled, opened, synchronize, reopened] + types: [labeled, opened, synchronize, reopened] + paths-ignore: + - "doc/**" + - "web/**" workflow_dispatch: concurrency: From bfe810b6f748bfd41f47781a4e658055e961df7a Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 29 Apr 2023 20:05:28 +0200 Subject: [PATCH 12/41] Backport PR #52982 on branch 2.0.x (BUG: Have non nano rounding noop return copy) (#53004) Backport PR #52982: BUG: Have non nano rounding noop return copy Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/arrays/datetimelike.py | 2 +- pandas/tests/series/accessors/test_dt_accessor.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 843e9be6de14a..8545cd1499b5e 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -2010,7 +2010,7 @@ def _round(self, freq, mode, ambiguous, nonexistent): nanos = delta_to_nanoseconds(offset, self._creso) if nanos == 0: # GH 52761 - return self + return self.copy() result_i8 = round_nsint64(values, mode, nanos) result = self._maybe_mask_results(result_i8, fill_value=iNaT) result = result.view(self._ndarray.dtype) diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 74371830f3a19..e211bc233b9cf 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -392,6 +392,8 @@ def test_dt_round_nonnano_higher_resolution_no_op(self, freq): result = ser.dt.round(freq) tm.assert_series_equal(result, expected) + assert not np.shares_memory(ser.array._ndarray, result.array._ndarray) + def test_dt_namespace_accessor_categorical(self): # GH 19468 dti = DatetimeIndex(["20171111", "20181212"]).repeat(2) From fbbdac50b82b6e83d32363897b8a34f5fdf64274 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sat, 29 Apr 2023 11:31:45 -0700 Subject: [PATCH 13/41] Backport PR #52952: ERR: Raise a better error message with to_pydatetime and ArrowDtype(pa.date) (#52989) * Backport PR #52952: ERR: Raise a better error message with to_pydatetime and ArrowDtype(pa.date) * Update pandas/tests/extension/test_arrow.py --- doc/source/whatsnew/v2.0.2.rst | 2 +- pandas/core/arrays/arrow/array.py | 5 +++++ pandas/tests/extension/test_arrow.py | 11 +++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index adfebd857b390..3ee7031795d16 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -31,7 +31,7 @@ Bug fixes Other ~~~~~ -- +- Raised a better error message when calling :func:`Series.dt.to_pydatetime` with :class:`ArrowDtype` with ``pyarrow.date32`` or ``pyarrow.date64`` type (:issue:`52812`) .. --------------------------------------------------------------------------- .. 
_whatsnew_202.contributors: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 0ebbecc6128a5..fb56a98fc87cc 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2161,6 +2161,11 @@ def _dt_round( return self._round_temporally("round", freq, ambiguous, nonexistent) def _dt_to_pydatetime(self): + if pa.types.is_date(self.dtype.pyarrow_dtype): + raise ValueError( + f"to_pydatetime cannot be called with {self.dtype.pyarrow_dtype} type. " + "Convert to pyarrow timestamp type." + ) data = self._data.to_pylist() if self._dtype.pyarrow_dtype.unit == "ns": data = [None if ts is None else ts.to_pydatetime(warn=False) for ts in data] diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 7fe97cec8e3cb..fa21d861b4240 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2512,6 +2512,17 @@ def test_dt_to_pydatetime(): tm.assert_numpy_array_equal(result, expected) +@pytest.mark.parametrize("date_type", [32, 64]) +def test_dt_to_pydatetime_date_error(date_type): + # GH 52812 + ser = pd.Series( + [date(2022, 12, 31)], + dtype=ArrowDtype(getattr(pa, f"date{date_type}")()), + ) + with pytest.raises(ValueError, match="to_pydatetime cannot be called with"): + ser.dt.to_pydatetime() + + def test_dt_tz_localize_unsupported_tz_options(): ser = pd.Series( [datetime(year=2023, month=1, day=2, hour=3), None], From 9135c3aaf12d26f857fcc787a5b64d521c51e379 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 1 May 2023 23:49:02 -0400 Subject: [PATCH 14/41] Backport PR #53001 on branch 2.0.x (BUG: Series.describe treating pyarrow timestamps/timedeltas as categorical) (#53031) * Backport PR #53001: BUG: Series.describe treating pyarrow timestamps/timedeltas as categorical * clean --- doc/source/whatsnew/v2.0.2.rst | 1 + pandas/core/methods/describe.py | 15 ++++++++------ pandas/tests/extension/test_arrow.py | 30 ++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 3ee7031795d16..8322c8408a0e3 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -23,6 +23,7 @@ Bug fixes - Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) - Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_*`` keywords when set to False ``dtype_backend="pyarrow"`` (:issue:`52872`) +- Bug in :meth:`Series.describe` treating pyarrow-backed timestamps and timedeltas as categorical data (:issue:`53001`) - Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`) - diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py index ef2cf8e96782d..ccd9ccfff808b 100644 --- a/pandas/core/methods/describe.py +++ b/pandas/core/methods/describe.py @@ -31,11 +31,10 @@ from pandas.core.dtypes.common import ( is_bool_dtype, is_complex_dtype, - is_datetime64_any_dtype, is_extension_array_dtype, is_numeric_dtype, - is_timedelta64_dtype, ) +from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.arrays.arrow.dtype import ArrowDtype from pandas.core.arrays.floating import Float64Dtype @@ -232,9 +231,13 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series: dtype: 
DtypeObj | None if is_extension_array_dtype(series): if isinstance(series.dtype, ArrowDtype): - import pyarrow as pa + if series.dtype.kind == "m": + # GH53001: describe timedeltas with object dtype + dtype = None + else: + import pyarrow as pa - dtype = ArrowDtype(pa.float64()) + dtype = ArrowDtype(pa.float64()) else: dtype = Float64Dtype() elif is_numeric_dtype(series) and not is_complex_dtype(series): @@ -362,9 +365,9 @@ def select_describe_func( return describe_categorical_1d elif is_numeric_dtype(data): return describe_numeric_1d - elif is_datetime64_any_dtype(data.dtype): + elif data.dtype.kind == "M" or isinstance(data.dtype, DatetimeTZDtype): return describe_timestamp_1d - elif is_timedelta64_dtype(data.dtype): + elif data.dtype.kind == "m": return describe_numeric_1d else: return describe_categorical_1d diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index fa21d861b4240..abad641819ed1 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2658,6 +2658,36 @@ def test_describe_numeric_data(pa_type): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("pa_type", tm.TIMEDELTA_PYARROW_DTYPES) +def test_describe_timedelta_data(pa_type): + # GH53001 + data = pd.Series(range(1, 10), dtype=ArrowDtype(pa_type)) + result = data.describe() + expected = pd.Series( + [9] + pd.to_timedelta([5, 2, 1, 3, 5, 7, 9], unit=pa_type.unit).tolist(), + dtype=object, + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("pa_type", tm.DATETIME_PYARROW_DTYPES) +def test_describe_datetime_data(pa_type): + # GH53001 + data = pd.Series(range(1, 10), dtype=ArrowDtype(pa_type)) + result = data.describe() + expected = pd.Series( + [9] + + [ + pd.Timestamp(v, tz=pa_type.tz, unit=pa_type.unit) + for v in [5, 1, 3, 5, 7, 9] + ], + dtype=object, + index=["count", "mean", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.xfail( pa_version_under8p0, reason="Function 'add_checked' has no kernel matching input types", From 1bb9365df1907174267c659d0ece9d903ffbf588 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 4 May 2023 22:15:55 +0200 Subject: [PATCH 15/41] Backport PR #53078 on branch 2.0.x (BUG: pd.api.interchange.from_pandas raises with all-null categorical) (#53081) Backport PR #53078: BUG: pd.api.interchange.from_pandas raises with all-null categorical Co-authored-by: Marco Edward Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- doc/source/whatsnew/v2.0.2.rst | 1 + pandas/core/interchange/from_dataframe.py | 5 ++++- pandas/tests/interchange/test_impl.py | 12 ++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 8322c8408a0e3..71c31e61fa6a3 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -20,6 +20,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Bug in :func:`api.interchange.from_dataframe` was raising ``IndexError`` on empty categorical data (:issue:`53077`) - Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) - Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_*`` keywords when set to False 
``dtype_backend="pyarrow"`` (:issue:`52872`) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 45d6bdd7917c1..78e530f915117 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -202,7 +202,10 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]: # Doing modulo in order to not get ``IndexError`` for # out-of-bounds sentinel values in `codes` - values = categories[codes % len(categories)] + if len(categories) > 0: + values = categories[codes % len(categories)] + else: + values = codes cat = pd.Categorical( values, categories=categories, ordered=categorical["is_ordered"] diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index d393ba6fd3957..301cccec9e0ed 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -89,6 +89,18 @@ def test_categorical_pyarrow(): tm.assert_frame_equal(result, expected) +def test_empty_categorical_pyarrow(): + # https://github.com/pandas-dev/pandas/issues/53077 + pa = pytest.importorskip("pyarrow", "11.0.0") + + arr = [None] + table = pa.table({"arr": pa.array(arr, "float64").dictionary_encode()}) + exchange_df = table.__dataframe__() + result = pd.api.interchange.from_dataframe(exchange_df) + expected = pd.DataFrame({"arr": pd.Categorical([np.nan])}) + tm.assert_frame_equal(result, expected) + + def test_large_string_pyarrow(): # GH 52795 pa = pytest.importorskip("pyarrow", "11.0.0") From 71601f5aacb0bf922db65690498992278ad39552 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 4 May 2023 22:32:22 +0200 Subject: [PATCH 16/41] Backport PR #53060 on branch 2.0.x (REGR: df.loc setitem losing midx names) (#53080) Backport PR #53060: REGR: df.loc setitem losing midx names Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.0.2.rst | 1 + pandas/core/frame.py | 7 ++++++- pandas/tests/indexing/multiindex/test_setitem.py | 15 +++++++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 71c31e61fa6a3..726ff60f7a197 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -13,6 +13,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed regression in :meth:`DataFrame.loc` losing :class:`MultiIndex` name when enlarging object (:issue:`53053`) - ..
--------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index da61d5e88a882..8be8ed188cce2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9538,7 +9538,12 @@ def _append( "or if the Series has a name" ) - index = Index([other.name], name=self.index.name) + index = Index( + [other.name], + name=self.index.names + if isinstance(self.index, MultiIndex) + else self.index.name, + ) row_df = other.to_frame().T # infer_objects is needed for # test_append_empty_frame_to_series_with_dateutil_tz diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 27b4a4be73032..e6f44359a1a62 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -479,6 +479,21 @@ def test_setitem_new_column_all_na(self): df["new"] = s assert df["new"].isna().all() + def test_setitem_enlargement_keep_index_names(self): + # GH#53053 + mi = MultiIndex.from_tuples([(1, 2, 3)], names=["i1", "i2", "i3"]) + df = DataFrame(data=[[10, 20, 30]], index=mi, columns=["A", "B", "C"]) + df.loc[(0, 0, 0)] = df.loc[(1, 2, 3)] + mi_expected = MultiIndex.from_tuples( + [(1, 2, 3), (0, 0, 0)], names=["i1", "i2", "i3"] + ) + expected = DataFrame( + data=[[10, 20, 30], [10, 20, 30]], + index=mi_expected, + columns=["A", "B", "C"], + ) + tm.assert_frame_equal(df, expected) + @td.skip_array_manager_invalid_test # df["foo"] select multiple columns -> .values # is not a view From 70d80d8bdca63097252e92d765486dbb6c64ec71 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 5 May 2023 21:26:53 +0200 Subject: [PATCH 17/41] Backport PR #53055 on branch 2.0.x (BUG: Fix regression when printing backslash in DataFrame.to_string) (#53107) Backport PR #53055: BUG: Fix regression when printing backslash in DataFrame.to_string Co-authored-by: Gianluca Ficarelli <26835404+GianlucaFicarelli@users.noreply.github.com> --- doc/source/whatsnew/v2.0.2.rst | 2 +- pandas/io/formats/string.py | 7 +-- pandas/tests/io/formats/test_format.py | 83 ++++++++++++++++++++++++-- 3 files changed, 81 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 726ff60f7a197..ae251f25de578 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -14,7 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`DataFrame.loc` losing :class:`MultiIndex` name when enlarging object (:issue:`53053`) -- +- Fixed regression in :meth:`DataFrame.to_string` printing a backslash at the end of the first row of data, instead of headers, when the DataFrame doesn't fit the line width (:issue:`53054`) .. --------------------------------------------------------------------------- .. 
_whatsnew_202.bug_fixes: diff --git a/pandas/io/formats/string.py b/pandas/io/formats/string.py index 071afc059b166..c143988bdc885 100644 --- a/pandas/io/formats/string.py +++ b/pandas/io/formats/string.py @@ -135,12 +135,6 @@ def _join_multiline(self, strcols_input: Iterable[list[str]]) -> str: col_bins = _binify(col_widths, lwidth) nbins = len(col_bins) - if self.fmt.is_truncated_vertically: - assert self.fmt.max_rows_fitted is not None - nrows = self.fmt.max_rows_fitted + 1 - else: - nrows = len(self.frame) - str_lst = [] start = 0 for i, end in enumerate(col_bins): @@ -148,6 +142,7 @@ def _join_multiline(self, strcols_input: Iterable[list[str]]) -> str: if self.fmt.index: row.insert(0, idx) if nbins > 1: + nrows = len(row[-1]) if end <= len(strcols) and i < nbins - 1: row.append([" \\"] + [" "] * (nrows - 1)) else: diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 0d84ecf955700..fcb7f6657beac 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -1397,25 +1397,100 @@ def test_to_string_no_index(self): assert df_s == expected def test_to_string_line_width_no_index(self): - # GH 13998, GH 22505, # GH 49230 + # GH 13998, GH 22505 df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) df_s = df.to_string(line_width=1, index=False) - expected = " x \n 1 \\\n 2 \n 3 \n\n y \n 4 \n 5 \n 6 " + expected = " x \\\n 1 \n 2 \n 3 \n\n y \n 4 \n 5 \n 6 " assert df_s == expected df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) df_s = df.to_string(line_width=1, index=False) - expected = " x \n11 \\\n22 \n33 \n\n y \n 4 \n 5 \n 6 " + expected = " x \\\n11 \n22 \n33 \n\n y \n 4 \n 5 \n 6 " assert df_s == expected df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) df_s = df.to_string(line_width=1, index=False) - expected = " x \n 11 \\\n 22 \n-33 \n\n y \n 4 \n 5 \n-6 " + expected = " x \\\n 11 \n 22 \n-33 \n\n y \n 4 \n 5 \n-6 " + + assert df_s == expected + + def test_to_string_line_width_no_header(self): + # GH 53054 + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1, header=False) + expected = "0 1 \\\n1 2 \n2 3 \n\n0 4 \n1 5 \n2 6 " + + assert df_s == expected + + df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1, header=False) + expected = "0 11 \\\n1 22 \n2 33 \n\n0 4 \n1 5 \n2 6 " + + assert df_s == expected + + df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) + + df_s = df.to_string(line_width=1, header=False) + expected = "0 11 \\\n1 22 \n2 -33 \n\n0 4 \n1 5 \n2 -6 " + + assert df_s == expected + + def test_to_string_line_width_no_index_no_header(self): + # GH 53054 + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1, index=False, header=False) + expected = "1 \\\n2 \n3 \n\n4 \n5 \n6 " + + assert df_s == expected + + df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1, index=False, header=False) + expected = "11 \\\n22 \n33 \n\n4 \n5 \n6 " + + assert df_s == expected + + df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) + + df_s = df.to_string(line_width=1, index=False, header=False) + expected = " 11 \\\n 22 \n-33 \n\n 4 \n 5 \n-6 " + + assert df_s == expected + + def test_to_string_line_width_with_both_index_and_header(self): + # GH 53054 + df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1) + expected = ( + " x \\\n0 1 \n1 2 \n2 3 \n\n y \n0 4 \n1 5 \n2 6 " + ) + + assert df_s == expected + + df = 
DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) + + df_s = df.to_string(line_width=1) + expected = ( + " x \\\n0 11 \n1 22 \n2 33 \n\n y \n0 4 \n1 5 \n2 6 " + ) + + assert df_s == expected + + df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]}) + + df_s = df.to_string(line_width=1) + expected = ( + " x \\\n0 11 \n1 22 \n2 -33 \n\n y \n0 4 \n1 5 \n2 -6 " + ) assert df_s == expected From 44f0a9b6361d23588cab990420d64c707a06a112 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 6 May 2023 16:42:11 +0200 Subject: [PATCH 18/41] Backport PR #53102 on branch 2.0.x (REGR: MultiIndex.join not resorting levels of new index) (#53113) Backport PR #53102: REGR: MultiIndex.join not resorting levels of new index Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.0.2.rst | 2 ++ pandas/core/indexes/base.py | 2 +- pandas/tests/indexes/multi/test_join.py | 12 ++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index ae251f25de578..7864791f8bc59 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -15,6 +15,8 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`DataFrame.loc` losing :class:`MultiIndex` name when enlarging object (:issue:`53053`) - Fixed regression in :meth:`DataFrame.to_string` printing a backslash at the end of the first row of data, instead of headers, when the DataFrame doesn't fit the line width (:issue:`53054`) +- Fixed regression in :meth:`MultiIndex.join` returning levels in wrong order (:issue:`53093`) +- .. --------------------------------------------------------------------------- .. _whatsnew_202.bug_fixes: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 890ac084680a1..e2b8400188136 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4877,7 +4877,7 @@ def _wrap_joined_index( mask = lidx == -1 join_idx = self.take(lidx) right = other.take(ridx) - join_index = join_idx.putmask(mask, right) + join_index = join_idx.putmask(mask, right)._sort_levels_monotonic() return join_index.set_names(name) # type: ignore[return-value] else: name = get_op_result_name(self, other) diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py index 4fff862961920..c5a3512113655 100644 --- a/pandas/tests/indexes/multi/test_join.py +++ b/pandas/tests/indexes/multi/test_join.py @@ -257,3 +257,15 @@ def test_join_dtypes_all_nan(any_numeric_ea_dtype): ] ) tm.assert_index_equal(result, expected) + + +def test_join_index_levels(): + # GH#53093 + midx = MultiIndex.from_tuples([("a", "2019-02-01"), ("a", "2019-02-01")]) + midx2 = MultiIndex.from_tuples([("a", "2019-01-31")]) + result = midx.join(midx2, how="outer") + expected = MultiIndex.from_tuples( + [("a", "2019-01-31"), ("a", "2019-02-01"), ("a", "2019-02-01")] + ) + tm.assert_index_equal(result.levels[1], expected.levels[1]) + tm.assert_index_equal(result, expected) From 291acfb3c5c07040e0e01d09828af7d8becc1918 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 8 May 2023 18:32:21 +0200 Subject: [PATCH 19/41] Backport PR #53118 on branch 2.0.x (REGR: read_sql dropping duplicated columns) (#53136) Backport PR #53118: REGR: read_sql dropping duplicated columns Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> ---
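Notes: a minimal reproduction sketch of the regression this patch fixes, assuming only an in-memory SQLite connection from the standard library; the table and query are adapted from the new test below.

    import sqlite3

    import pandas as pd

    conn = sqlite3.connect(":memory:")
    pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}).to_sql(
        "test_table", conn, index=False
    )

    # The query yields two columns named "a". Before this fix the result
    # frame was built via dict(zip(columns, arrays)), so the duplicated
    # key silently dropped one of the columns.
    result = pd.read_sql("SELECT a, b, a + 1 AS a FROM test_table;", conn)
    print(list(result.columns))  # with the fix: ['a', 'b', 'a']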
doc/source/whatsnew/v2.0.2.rst | 1 + pandas/io/sql.py | 4 +++- pandas/tests/io/test_sql.py | 12 ++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 7864791f8bc59..fae0f06beb3a8 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -13,6 +13,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed regression in :func:`read_sql` dropping columns with duplicated column names (:issue:`53117`) - Fixed regression in :meth:`DataFrame.loc` losing :class:`MultiIndex` name when enlarging object (:issue:`53053`) - Fixed regression in :meth:`DataFrame.to_string` printing a backslash at the end of the first row of data, instead of headers, when the DataFrame doesn't fit the line width (:issue:`53054`) - Fixed regression in :meth:`MultiIndex.join` returning levels in wrong order (:issue:`53093`) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index c2393fc7ada06..a627a60ef0691 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -158,7 +158,9 @@ def _convert_arrays_to_dataframe( ArrowExtensionArray(pa.array(arr, from_pandas=True)) for arr in arrays ] if arrays: - return DataFrame(dict(zip(columns, arrays))) + df = DataFrame(dict(zip(list(range(len(columns))), arrays))) + df.columns = columns + return df else: return DataFrame(columns=columns) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index d2fb4a8426cf8..b749a863a3937 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1496,6 +1496,18 @@ def test_escaped_table_name(self): tm.assert_frame_equal(res, df) + def test_read_sql_duplicate_columns(self): + # GH#53117 + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": 1}) + df.to_sql("test_table", self.conn, index=False) + + result = pd.read_sql("SELECT a, b, a +1 as a, c FROM test_table;", self.conn) + expected = DataFrame( + [[1, 0.1, 2, 1], [2, 0.2, 3, 1], [3, 0.3, 4, 1]], + columns=["a", "b", "a", "c"], + ) + tm.assert_frame_equal(result, expected) + @pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="SQLAlchemy not installed") class TestSQLApi(SQLAlchemyMixIn, _TestSQLApi): From 5962f0ebcb0acfe9987b4a89078c2505ce1a3bae Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 8 May 2023 18:40:06 +0200 Subject: [PATCH 20/41] Backport PR #51895 on branch 2.0.x (BUG: Fix getitem dtype preservation with multiindexes) (#53121) * BUG: Fix getitem dtype preservation with multiindexes (#51895) * BUG/TST fix dtype preservation with multindex * lint * Update pandas/tests/indexing/multiindex/test_multiindex.py Co-authored-by: Joris Van den Bossche * cleanups * switch to iloc, reindex fails in some cases * suggestions from code review * address code review comments Co-Authored-By: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --------- Co-authored-by: Joris Van den Bossche Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> (cherry picked from commit 194b6bb006a48b913b73e176b1210ece54f226a8) * Add whatsnew --------- Co-authored-by: Matt Richards <45483497+m-richards@users.noreply.github.com> --- doc/source/whatsnew/v2.0.2.rst | 1 + pandas/core/frame.py | 14 ++----------- .../indexing/multiindex/test_multiindex.py | 20 +++++++++++++++++++ 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index fae0f06beb3a8..2fc10cbf0a4a6 100644 --- 
a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -30,6 +30,7 @@ Bug fixes - Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_*`` keywords when set to False ``dtype_backend="pyarrow"`` (:issue:`52872`) - Bug in :meth:`Series.describe` treating pyarrow-backed timestamps and timedeltas as categorical data (:issue:`53001`) - Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`) +- Bug in :meth:`DataFrame.__getitem__` not preserving dtypes for :class:`MultiIndex` partial keys (:issue:`51895`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8be8ed188cce2..7e1d8711aee86 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3816,18 +3816,8 @@ def _getitem_multilevel(self, key): if isinstance(loc, (slice, np.ndarray)): new_columns = self.columns[loc] result_columns = maybe_droplevels(new_columns, key) - if self._is_mixed_type: - result = self.reindex(columns=new_columns) - result.columns = result_columns - else: - new_values = self._values[:, loc] - result = self._constructor( - new_values, index=self.index, columns=result_columns, copy=False - ) - if using_copy_on_write() and isinstance(loc, slice): - result._mgr.add_references(self._mgr) # type: ignore[arg-type] - - result = result.__finalize__(self) + result = self.iloc[:, loc] + result.columns = result_columns # If there is only one column being returned, and its name is # either an empty string, or a tuple with an empty string as its diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 8e507212976ec..22a6f62f53392 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -6,12 +6,14 @@ import pandas as pd from pandas import ( + CategoricalDtype, DataFrame, Index, MultiIndex, Series, ) import pandas._testing as tm +from pandas.core.arrays.boolean import BooleanDtype class TestMultiIndexBasic: @@ -206,3 +208,21 @@ def test_multiindex_with_na_missing_key(self): ) with pytest.raises(KeyError, match="missing_key"): df[[("missing_key",)]] + + def test_multiindex_dtype_preservation(self): + # GH51261 + columns = MultiIndex.from_tuples([("A", "B")], names=["lvl1", "lvl2"]) + df = DataFrame(["value"], columns=columns).astype("category") + df_no_multiindex = df["A"] + assert isinstance(df_no_multiindex["B"].dtype, CategoricalDtype) + + # geopandas 1763 analogue + df = DataFrame( + [[1, 0], [0, 1]], + columns=[ + ["foo", "foo"], + ["location", "location"], + ["x", "y"], + ], + ).assign(bools=Series([True, False], dtype="boolean")) + assert isinstance(df["bools"].dtype, BooleanDtype) From e33e8ae8c26588fed366af919e4c687390eb9bf9 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 9 May 2023 02:10:23 +0200 Subject: [PATCH 21/41] Backport PR #53027 on branch 2.0.x (CI: Add job to validate conda-forge meta.yaml) (#53146) Backport PR #53027: CI: Add job to validate conda-forge meta.yaml Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .github/workflows/package-checks.yml | 42 +++++++++++-- ci/meta.yaml | 93 ++++++++++++++++++++++++++++ 2 files changed, 131 insertions(+), 4 deletions(-) create mode 100644 ci/meta.yaml diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index 
fa1b5e5d4fba3..6f1fa771a7854 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -14,6 +14,10 @@ on: permissions: contents: read +defaults: + run: + shell: bash -el {0} + jobs: pip: if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}} @@ -44,9 +48,39 @@ jobs: run: | python -m pip install --upgrade pip setuptools wheel python-dateutil pytz numpy cython python -m pip install versioneer[toml] - shell: bash -el {0} - name: Pip install with extra - run: | - python -m pip install -e .[${{ matrix.extra }}] --no-build-isolation - shell: bash -el {0} + run: python -m pip install -e .[${{ matrix.extra }}] --no-build-isolation + conda_forge_recipe: + if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}} + runs-on: ubuntu-22.04 + strategy: + matrix: + python-version: ['3.9', '3.10', '3.11'] + fail-fast: false + name: Test Conda Forge Recipe - Python ${{ matrix.python-version }} + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-conda-forge-recipe-${{ matrix.python-version }} + cancel-in-progress: true + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Python + uses: mamba-org/provision-with-micromamba@v15 + with: + environment-file: false + environment-name: recipe-test + extra-specs: | + python=${{ matrix.python-version }} + boa + conda-verify + channels: conda-forge + cache-downloads: true + cache-env: true + + - name: Build conda package + run: conda mambabuild ci --no-anaconda-upload --verify --strict-verify --output --output-folder . diff --git a/ci/meta.yaml b/ci/meta.yaml new file mode 100644 index 0000000000000..f02c7eec001fc --- /dev/null +++ b/ci/meta.yaml @@ -0,0 +1,93 @@ +{% set version = "2.0.1" %} + +package: + name: pandas + version: {{ version }} + +source: + git_url: ../.. + +build: + number: 1 + script: + - export PYTHONUNBUFFERED=1 # [ppc64le] + - {{ PYTHON }} -m pip install -vv --no-deps --ignore-installed . # [not unix] + - {{ PYTHON }} -m pip install -vv --no-deps --ignore-installed . --global-option="build_ext" --global-option="-j4" --no-use-pep517 # [unix] + skip: true # [py<39] + +requirements: + build: + - python # [build_platform != target_platform] + - cross-python_{{ target_platform }} # [build_platform != target_platform] + - cython # [build_platform != target_platform] + - numpy # [build_platform != target_platform] + - {{ compiler('c') }} + - {{ compiler('cxx') }} + host: + - python + - pip + - setuptools >=61.0.0 + - cython >=0.29.33,<3 + - numpy >=1.21.6 # [py<311] + - numpy >=1.23.2 # [py>=311] + - versioneer + - tomli # [py<311] + run: + - python + - {{ pin_compatible('numpy') }} + - python-dateutil >=2.8.2 + - pytz >=2020.1 + - python-tzdata >=2022.1 + +test: + imports: + - pandas + commands: + - pip check + # Skip test suite on PyPy as it segfaults there + # xref: https://github.com/conda-forge/pandas-feedstock/issues/148 + # + # Also skip `test_rolling_var_numerical_issues` on `ppc64le` as it is a known test failure. 
+ # xref: https://github.com/conda-forge/pandas-feedstock/issues/149 + {% set markers = ["not clipboard", "not single_cpu", "not db", "not network", "not slow"] %} + {% set markers = markers + ["not arm_slow"] %} # [aarch64 or ppc64le] + {% set extra_args = ["-n=2 -m " + " and ".join(markers)] %} + {% set tests_to_skip = "_not_a_real_test" %} + {% set tests_to_skip = tests_to_skip + " or test_rolling_var_numerical_issues" %} # [ppc64le] + {% set tests_to_skip = tests_to_skip + " or test_std_timedelta64_skipna_false" %} # [ppc64le] + {% set tests_to_skip = tests_to_skip + " or test_value_counts_normalized[M8[ns]]" %} # [ppc64le] + {% set tests_to_skip = tests_to_skip + " or test_to_datetime_format_YYYYMMDD_with_nat" %} # [ppc64le] + {% set tests_to_skip = tests_to_skip + " or (TestReductions and test_median_2d)" %} # [ppc64le] + {% set extra_args = extra_args + ["-k", "not (" + tests_to_skip + ")"] %} + - python -c "import pandas; pandas.test(extra_args={{ extra_args }})" # [python_impl == "cpython"] + requires: + - pip + - pytest >=7.0.0 + - pytest-asyncio >=0.17.0 + - pytest-xdist >=2.2.0 + - pytest-cov + - hypothesis >=6.46.1 + - tomli # [py<311] + +about: + home: http://pandas.pydata.org + license: BSD-3-Clause + license_file: LICENSE + summary: Powerful data structures for data analysis, time series, and statistics + doc_url: https://pandas.pydata.org/docs/ + dev_url: https://github.com/pandas-dev/pandas + +extra: + recipe-maintainers: + - jreback + - jorisvandenbossche + - msarahan + - ocefpaf + - TomAugspurger + - WillAyd + - simonjayhawkins + - mroeschke + - datapythonista + - phofl + - lithomas1 + - marcogorelli From c65fce198f43666e078617c2b797df4940a9c513 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 12 May 2023 22:55:38 +0200 Subject: [PATCH 22/41] Backport PR #53195 on branch 2.0.x (PERF: Performance regression in Groupby.apply with group_keys=True) (#53202) Backport PR #53195: PERF: Performance regression in Groupby.apply with group_keys=True Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.0.2.rst | 1 + pandas/core/reshape/concat.py | 24 ++++++++++++++---------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 2fc10cbf0a4a6..e791e2a58ba5b 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -13,6 +13,7 @@ including other versions of pandas. 
Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed performance regression in :meth:`GroupBy.apply` (:issue:`53195`) - Fixed regression in :func:`read_sql` dropping columns with duplicated column names (:issue:`53117`) - Fixed regression in :meth:`DataFrame.loc` losing :class:`MultiIndex` name when enlarging object (:issue:`53053`) - Fixed regression in :meth:`DataFrame.to_string` printing a backslash at the end of the first row of data, instead of headers, when the DataFrame doesn't fit the line width (:issue:`53054`) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index bc8f4b97d539a..79f130451a986 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -446,7 +446,7 @@ def __init__( keys = type(keys).from_tuples(clean_keys, names=keys.names) else: name = getattr(keys, "name", None) - keys = Index(clean_keys, name=name) + keys = Index(clean_keys, name=name, dtype=getattr(keys, "dtype", None)) if len(objs) == 0: raise ValueError("All objects passed were None") @@ -743,15 +743,19 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde for hlevel, level in zip(zipped, levels): to_concat = [] - for key, index in zip(hlevel, indexes): - # Find matching codes, include matching nan values as equal. - mask = (isna(level) & isna(key)) | (level == key) - if not mask.any(): - raise ValueError(f"Key {key} not in level {level}") - i = np.nonzero(mask)[0][0] - - to_concat.append(np.repeat(i, len(index))) - codes_list.append(np.concatenate(to_concat)) + if isinstance(hlevel, Index) and hlevel.equals(level): + lens = [len(idx) for idx in indexes] + codes_list.append(np.repeat(np.arange(len(hlevel)), lens)) + else: + for key, index in zip(hlevel, indexes): + # Find matching codes, include matching nan values as equal. + mask = (isna(level) & isna(key)) | (level == key) + if not mask.any(): + raise ValueError(f"Key {key} not in level {level}") + i = np.nonzero(mask)[0][0] + + to_concat.append(np.repeat(i, len(index))) + codes_list.append(np.concatenate(to_concat)) concat_index = _concat_indexes(indexes) From 895b35038a8ab495c0c55791f8489c7ee81ad43b Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 14 May 2023 18:11:28 +0200 Subject: [PATCH 23/41] Backport PR #53218 on branch 2.0.x (FIX typo in deprecation message of `deprecate_kwarg` decorator) (#53222) Backport PR #53218: FIX typo in deprecation message of `deprecate_kwarg` decorator Co-authored-by: Yao Xiao <108576690+Charlie-XIAO@users.noreply.github.com> --- pandas/util/_decorators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index 968da7cf60105..d0e393e41c623 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -195,7 +195,7 @@ def wrapper(*args, **kwargs) -> Callable[..., Any]: else: new_arg_value = old_arg_value msg = ( - f"the {repr(old_arg_name)}' keyword is deprecated, " + f"the {repr(old_arg_name)} keyword is deprecated, " f"use {repr(new_arg_name)} instead." 
) From cc47ec2271a6b433a01a8c275eb3ba53f2f0ea61 Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 14 May 2023 20:50:19 +0200 Subject: [PATCH 24/41] Backport PR #53189 on branch 2.0.x (BUG/CoW: Series.rename not making a lazy copy when passed a scalar) (#53221) * Backport PR #53189: BUG/CoW: Series.rename not making a lazy copy when passed a scalar * Update test_setitem.py --------- Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.0.2.rst | 1 + pandas/core/series.py | 12 ++++++++---- pandas/tests/copy_view/test_methods.py | 2 ++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index e791e2a58ba5b..2b07622d70c3f 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -30,6 +30,7 @@ Bug fixes - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) - Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_*`` keywords when set to False ``dtype_backend="pyarrow"`` (:issue:`52872`) - Bug in :meth:`Series.describe` treating pyarrow-backed timestamps and timedeltas as categorical data (:issue:`53001`) +- Bug in :meth:`Series.rename` not making a lazy copy when Copy-on-Write is enabled and a scalar is passed to it (:issue:`52450`) - Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`) - Bug in :meth:`DataFrame.__getitem__` not preserving dtypes for :class:`MultiIndex` partial keys (:issue:`51895`) - diff --git a/pandas/core/series.py b/pandas/core/series.py index 29e02fdc7695d..78f4da4e65196 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1940,7 +1940,9 @@ def to_frame(self, name: Hashable = lib.no_default) -> DataFrame: df = self._constructor_expanddim(mgr) return df.__finalize__(self, method="to_frame") - def _set_name(self, name, inplace: bool = False) -> Series: + def _set_name( + self, name, inplace: bool = False, deep: bool | None = None + ) -> Series: """ Set the Series name. @@ -1949,9 +1951,11 @@ def _set_name(self, name, inplace: bool = False) -> Series: name : str inplace : bool Whether to modify `self` directly or return a copy.
+ deep : bool or None, default None + Whether to do a deep copy, a shallow copy, or Copy-on-Write (None) """ inplace = validate_bool_kwarg(inplace, "inplace") - ser = self if inplace else self.copy() + ser = self if inplace else self.copy(deep and not using_copy_on_write()) ser.name = name return ser @@ -4770,7 +4774,7 @@ def rename( index: Renamer | Hashable | None = None, *, axis: Axis | None = None, - copy: bool = True, + copy: bool | None = None, inplace: bool = False, level: Level | None = None, errors: IgnoreRaise = "ignore", @@ -4857,7 +4861,7 @@ def rename( errors=errors, ) else: - return self._set_name(index, inplace=inplace) + return self._set_name(index, inplace=inplace, deep=copy) @Appender( """ diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 1d8f1dea7d478..c220c46bdc8f8 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -135,6 +135,7 @@ def test_methods_copy_keyword( "method", [ lambda ser, copy: ser.rename(index={0: 100}, copy=copy), + lambda ser, copy: ser.rename(None, copy=copy), lambda ser, copy: ser.reindex(index=ser.index, copy=copy), lambda ser, copy: ser.reindex_like(ser, copy=copy), lambda ser, copy: ser.align(ser, copy=copy)[0], @@ -152,6 +153,7 @@ def test_methods_copy_keyword( lambda ser, copy: ser.set_flags(allows_duplicate_labels=False, copy=copy), ], ids=[ + "rename (dict)", "rename", "reindex", "reindex_like", From 7a28cedf4a7e4a816aadf4b43d3b84a12770a383 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 15 May 2023 03:08:46 +0200 Subject: [PATCH 25/41] Backport PR #53213 on branch 2.0.x (FIX preserve dtype with datetime columns of different resolution when merging) (#53228) FIX preserve dtype with datetime columns of different resolution when merging (#53213) (cherry picked from commit 935244a9b9c0fe796de315cd381d6364719bfdc1) Co-authored-by: Guillaume Lemaitre --- doc/source/whatsnew/v2.0.2.rst | 1 + pandas/core/reshape/merge.py | 8 +++++++- pandas/tests/reshape/merge/test_merge.py | 24 ++++++++++++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 2b07622d70c3f..2b488ecec0b11 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -28,6 +28,7 @@ Bug fixes - Bug in :func:`api.interchange.from_dataframe` was raising ``IndexError`` on empty categorical data (:issue:`53077`) - Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) +- Bug in :func:`merge` when merging on datetime columns on different resolutions (:issue:`53200`) - Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_*`` keywords when set to False ``dtype_backend="pyarrow"`` (:issue:`52872`) - Bug in :meth:`Series.describe` treating pyarrow-backed timestamps and timedeltas as categorical data (:issue:`53001`) - Bug in :meth:`Series.rename` not making a lazy copy when Copy-on-Write is enabled and a scalar is passed to it (:issue:`52450`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index bb01d551628d3..a0207d492023f 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1401,6 +1401,12 @@ def _maybe_coerce_merge_keys(self) -> None: rk.dtype, DatetimeTZDtype ): raise ValueError(msg) + elif (
isinstance(lk.dtype, DatetimeTZDtype) + and isinstance(rk.dtype, DatetimeTZDtype) + ) or (lk.dtype.kind == "M" and rk.dtype.kind == "M"): + # allows datetime with different resolutions + continue elif lk_is_object and rk_is_object: continue @@ -2355,7 +2361,7 @@ def _factorize_keys( if isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype): # Extract the ndarray (UTC-localized) values # Note: we don't need the dtypes to match, as these can still be compared - # TODO(non-nano): need to make sure resolutions match + lk, rk = cast("DatetimeArray", lk)._ensure_matching_resos(rk) lk = cast("DatetimeArray", lk)._ndarray rk = cast("DatetimeArray", rk)._ndarray diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index a7b007f043da9..a4d1bfbaa34be 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -7,6 +7,7 @@ import numpy as np import pytest +import pytz from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -2750,3 +2751,26 @@ def test_merge_arrow_and_numpy_dtypes(dtype): result = df2.merge(df) expected = df2.copy() tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("tzinfo", [None, pytz.timezone("America/Chicago")]) +def test_merge_datetime_different_resolution(tzinfo): + # https://github.com/pandas-dev/pandas/issues/53200 + df1 = DataFrame( + { + "t": [pd.Timestamp(2023, 5, 12, tzinfo=tzinfo, unit="ns")], + "a": [1], + } + ) + df2 = df1.copy() + df2["t"] = df2["t"].dt.as_unit("s") + + expected = DataFrame( + { + "t": [pd.Timestamp(2023, 5, 12, tzinfo=tzinfo)], + "a_x": [1], + "a_y": [1], + } + ) + result = df1.merge(df2, on="t") + tm.assert_frame_equal(result, expected) From 340346c16be2c2c2c659d82dfcb2445f44653cb9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 15 May 2023 23:51:37 +0200 Subject: [PATCH 26/41] Backport PR #53232 on branch 2.0.x (BUG: sort_values raising for dictionary arrow dtype) (#53241) BUG: sort_values raising for dictionary arrow dtype (#53232) (cherry picked from commit e1b657a7efdc6c8663d6a747d32aa46585ee328b) --- doc/source/whatsnew/v2.0.2.rst | 1 + pandas/core/arrays/arrow/array.py | 10 ++++++++-- pandas/tests/extension/test_arrow.py | 14 ++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 2b488ecec0b11..f3432564233a1 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -30,6 +30,7 @@ Bug fixes - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) - Bug in :func:`merge` when merging on datetime columns on different resolutions (:issue:`53200`) - Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_*`` keywords when set to False ``dtype_backend="pyarrow"`` (:issue:`52872`) +- Bug in :meth:`DataFrame.sort_values` raising for PyArrow ``dictionary`` dtype (:issue:`53232`) - Bug in :meth:`Series.describe` treating pyarrow-backed timestamps and timedeltas as categorical data (:issue:`53001`) - Bug in :meth:`Series.rename` not making a lazy copy when Copy-on-Write is enabled and a scalar is passed to it (:issue:`52450`) - Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index fb56a98fc87cc..611ef142a72a5 100644 --- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py @@ -269,7 +269,10 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal # GH50430: let pyarrow infer type, then cast scalars = pa.array(scalars, from_pandas=True) if pa_dtype: - scalars = scalars.cast(pa_dtype) + if pa.types.is_dictionary(pa_dtype): + scalars = scalars.dictionary_encode() + else: + scalars = scalars.cast(pa_dtype) arr = cls(scalars) if pa.types.is_duration(scalars.type) and scalars.null_count > 0: # GH52843: upstream bug for duration types when originally @@ -868,7 +871,10 @@ def factorize( else: data = self._data - encoded = data.dictionary_encode(null_encoding=null_encoding) + if pa.types.is_dictionary(data.type): + encoded = data + else: + encoded = data.dictionary_encode(null_encoding=null_encoding) if encoded.length() == 0: indices = np.array([], dtype=np.intp) uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type)) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index abad641819ed1..fdff9dd873fec 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1831,6 +1831,20 @@ def test_searchsorted_with_na_raises(data_for_sorting, as_series): arr.searchsorted(b) +def test_sort_values_dictionary(): + df = pd.DataFrame( + { + "a": pd.Series( + ["x", "y"], dtype=ArrowDtype(pa.dictionary(pa.int32(), pa.string())) + ), + "b": [1, 2], + }, + ) + expected = df.copy() + result = df.sort_values(by=["a", "b"]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("pat", ["abc", "a[a-z]{2}"]) def test_str_count(pat): ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) From a23c15c4b78c80ce0bfbbeee8cf5c6fa4b77711a Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 17 May 2023 22:40:48 +0200 Subject: [PATCH 27/41] Backport PR #53233 on branch 2.0.x (BUG: preserve dtype for right/outer merge of datetime with different resolutions) (#53275) Backport PR #53233: BUG: preserve dtype for right/outer merge of datetime with different resolutions Co-authored-by: Joris Van den Bossche --- pandas/core/reshape/merge.py | 8 +++++ pandas/tests/reshape/merge/test_merge.py | 39 +++++++++++++----------- 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index a0207d492023f..c7d494f19be8e 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1000,6 +1000,14 @@ def _maybe_add_join_keys( else: key_col = Index(lvals).where(~mask_left, rvals) result_dtype = find_common_type([lvals.dtype, rvals.dtype]) + if ( + lvals.dtype.kind == "M" + and rvals.dtype.kind == "M" + and result_dtype.kind == "O" + ): + # TODO(non-nano) Workaround for common_type not dealing + # with different resolutions + result_dtype = key_col.dtype if result._is_label_reference(name): result[name] = Series( diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index a4d1bfbaa34be..ce1c26784475d 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -7,7 +7,6 @@ import numpy as np import pytest -import pytz from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -2753,24 +2752,28 @@ def test_merge_arrow_and_numpy_dtypes(dtype): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("tzinfo", [None, pytz.timezone("America/Chicago")]) -def 
test_merge_datetime_different_resolution(tzinfo): +@pytest.mark.parametrize("how", ["inner", "left", "outer", "right"]) +@pytest.mark.parametrize("tz", [None, "America/Chicago"]) +def test_merge_datetime_different_resolution(tz, how): # https://github.com/pandas-dev/pandas/issues/53200 - df1 = DataFrame( - { - "t": [pd.Timestamp(2023, 5, 12, tzinfo=tzinfo, unit="ns")], - "a": [1], - } - ) - df2 = df1.copy() + vals = [ + pd.Timestamp(2023, 5, 12, tz=tz), + pd.Timestamp(2023, 5, 13, tz=tz), + pd.Timestamp(2023, 5, 14, tz=tz), + ] + df1 = DataFrame({"t": vals[:2], "a": [1.0, 2.0]}) + df1["t"] = df1["t"].dt.as_unit("ns") + df2 = DataFrame({"t": vals[1:], "b": [1.0, 2.0]}) df2["t"] = df2["t"].dt.as_unit("s") - expected = DataFrame( - { - "t": [pd.Timestamp(2023, 5, 12, tzinfo=tzinfo)], - "a_x": [1], - "a_y": [1], - } - ) - result = df1.merge(df2, on="t") + expected = DataFrame({"t": vals, "a": [1.0, 2.0, np.nan], "b": [np.nan, 1.0, 2.0]}) + expected["t"] = expected["t"].dt.as_unit("ns") + if how == "inner": + expected = expected.iloc[[1]].reset_index(drop=True) + elif how == "left": + expected = expected.iloc[[0, 1]] + elif how == "right": + expected = expected.iloc[[1, 2]].reset_index(drop=True) + + result = df1.merge(df2, on="t", how=how) tm.assert_frame_equal(result, expected) From 8983c5de5976d784f73b376a05646b8cb6f40ee7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 17 May 2023 13:41:02 -0700 Subject: [PATCH 28/41] Backport PR #53175: Add `np.intc` to `_factorizers` in `pd.merge` (#53276) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Simon Høxbro Hansen --- doc/source/whatsnew/v2.0.2.rst | 2 +- pandas/core/reshape/merge.py | 4 ++++ pandas/tests/reshape/merge/test_merge.py | 4 +++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index f3432564233a1..50313a8ca4796 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -14,11 +14,11 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed performance regression in :meth:`GroupBy.apply` (:issue:`53195`) +- Fixed regression in :func:`merge` on Windows when dtype is ``np.intc`` (:issue:`52451`) - Fixed regression in :func:`read_sql` dropping columns with duplicated column names (:issue:`53117`) - Fixed regression in :meth:`DataFrame.loc` losing :class:`MultiIndex` name when enlarging object (:issue:`53053`) - Fixed regression in :meth:`DataFrame.to_string` printing a backslash at the end of the first row of data, instead of headers, when the DataFrame doesn't fit the line width (:issue:`53054`) - Fixed regression in :meth:`MultiIndex.join` returning levels in wrong order (:issue:`53093`) -- .. --------------------------------------------------------------------------- .. 
_whatsnew_202.bug_fixes: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index c7d494f19be8e..4c9cfb476586e 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -123,6 +123,10 @@ np.object_: libhashtable.ObjectFactorizer, } +# See https://github.com/pandas-dev/pandas/issues/52451 +if np.intc is not np.int32: + _factorizers[np.intc] = libhashtable.Int64Factorizer + @Substitution("\nleft : DataFrame or named Series") @Appender(_merge_doc, indents=0) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index ce1c26784475d..9d1346b4ad073 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1463,7 +1463,9 @@ def test_different(self, right_vals): result = merge(left, right, on="A") assert is_object_dtype(result.A.dtype) - @pytest.mark.parametrize("d1", [np.int64, np.int32, np.int16, np.int8, np.uint8]) + @pytest.mark.parametrize( + "d1", [np.int64, np.int32, np.intc, np.int16, np.int8, np.uint8] + ) @pytest.mark.parametrize("d2", [np.int64, np.float64, np.float32, np.float16]) def test_join_multi_dtypes(self, d1, d2): dtype1 = np.dtype(d1) From 456a04102605130ba2ab32aa578e22cc08f49286 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Wed, 17 May 2023 21:41:19 +0100 Subject: [PATCH 29/41] "Backport PR #53257 on branch 2.0.x (BUG: Add AM/PM token support on guess_datetime_format)" (#53277) Backport PR #53257: BUG: Add AM/PM token support on guess_datetime_format --------- Co-authored-by: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> (cherry picked from commit f2de598d9002801e2056b4f8e2f587fd8bb07bc1) Co-authored-by: Julian Badillo --- doc/source/whatsnew/v2.0.2.rst | 1 + pandas/_libs/tslibs/parsing.pyx | 5 +++++ pandas/tests/tools/test_to_datetime.py | 3 +++ pandas/tests/tslibs/test_parsing.py | 6 ++++-- 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 50313a8ca4796..8713979331afd 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -29,6 +29,7 @@ Bug fixes - Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) - Bug in :func:`merge` when merging on datetime columns on different resolutions (:issue:`53200`) +- Bug in :func:`to_datetime` was inferring format to contain ``"%H"`` instead of ``"%I"`` if date contained "AM" / "PM" tokens (:issue:`53147`) - Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_*`` keywords when set to False ``dtype_backend="pyarrow"`` (:issue:`52872`) - Bug in :meth:`DataFrame.sort_values` raising for PyArrow ``dictionary`` dtype (:issue:`53232`) - Bug in :meth:`Series.describe` treating pyarrow-backed timestamps and timedeltas as categorical data (:issue:`53001`) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 445683968c58f..146e14f622ccd 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -990,6 +990,11 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None: output_format.append(tokens[i]) + # if am/pm token present, replace 24-hour %H, with 12-hour %I + if "%p" in output_format and "%H" in output_format: + i = output_format.index("%H") + output_format[i] = "%I" + guessed_format = "".join(output_format) 
try: diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 82c2375ffd628..e741fd310eb41 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2953,6 +2953,9 @@ class TestDatetimeParsingWrappers: ("2005-11-09 10:15", datetime(2005, 11, 9, 10, 15)), ("2005-11-09 08H", datetime(2005, 11, 9, 8, 0)), ("2005/11/09 10:15", datetime(2005, 11, 9, 10, 15)), + ("2005/11/09 10:15:32", datetime(2005, 11, 9, 10, 15, 32)), + ("2005/11/09 10:15:32 AM", datetime(2005, 11, 9, 10, 15, 32)), + ("2005/11/09 10:15:32 PM", datetime(2005, 11, 9, 22, 15, 32)), ("2005/11/09 08H", datetime(2005, 11, 9, 8, 0)), ("Thu Sep 25 10:36:28 2003", datetime(2003, 9, 25, 10, 36, 28)), ("Thu Sep 25 2003", datetime(2003, 9, 25)), diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 6500afdf87beb..e33c6e37ac0e7 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -201,8 +201,10 @@ def test_parsers_month_freq(date_str, expected): ("2011-12-30T00:00:00.000000+9:0", "%Y-%m-%dT%H:%M:%S.%f%z"), ("2011-12-30T00:00:00.000000+09:", None), ("2011-12-30 00:00:00.000000", "%Y-%m-%d %H:%M:%S.%f"), - ("Tue 24 Aug 2021 01:30:48 AM", "%a %d %b %Y %H:%M:%S %p"), - ("Tuesday 24 Aug 2021 01:30:48 AM", "%A %d %b %Y %H:%M:%S %p"), + ("Tue 24 Aug 2021 01:30:48", "%a %d %b %Y %H:%M:%S"), + ("Tuesday 24 Aug 2021 01:30:48", "%A %d %b %Y %H:%M:%S"), + ("Tue 24 Aug 2021 01:30:48 AM", "%a %d %b %Y %I:%M:%S %p"), + ("Tuesday 24 Aug 2021 01:30:48 AM", "%A %d %b %Y %I:%M:%S %p"), ("27.03.2003 14:55:00.000", "%d.%m.%Y %H:%M:%S.%f"), # GH50317 ], ) From 092ef682ee1e18eebc4e0b0305106128c158f7fe Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 20 May 2023 11:20:50 -0700 Subject: [PATCH 30/41] Backport PR #53309 on branch 2.0.x (CI: Pin Numba<0.57 in ARM build) (#53318) CI: Pin Numba<0.57 in ARM build (#53309) (cherry picked from commit 9d38f40767da08ff46b807fd9565666a7aaf079c) Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- ci/deps/circle-38-arm64.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-38-arm64.yaml index 7bc71483be34a..8f309b0781457 100644 --- a/ci/deps/circle-38-arm64.yaml +++ b/ci/deps/circle-38-arm64.yaml @@ -33,7 +33,8 @@ dependencies: - jinja2 - lxml - matplotlib>=3.6.1, <3.7.0 - - numba + # test_numba_vs_cython segfaults with numba 0.57 + - numba>=0.55.2, <0.57.0 - numexpr - openpyxl<3.1.1 - odfpy From 2b012f0cc209d715dc4b378d3c6e046c6f2c33a5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 20 May 2023 11:51:03 -0700 Subject: [PATCH 31/41] Backport PR #53295 on branch 2.0.x (BUG: read_csv raising for arrow engine and parse_dates) (#53317) BUG: read_csv raising for arrow engine and parse_dates (#53295) (cherry picked from commit aaf503784a7cbf6425d681551d1e1e686ce14815) --- doc/source/whatsnew/v2.0.2.rst | 2 +- pandas/io/parsers/base_parser.py | 3 +++ pandas/tests/io/parser/test_parse_dates.py | 20 ++++++++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 8713979331afd..a39d68a2f8ae9 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -29,6 +29,7 @@ Bug fixes - Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of 
incorrect sizes when called on slices (:issue:`52824`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) - Bug in :func:`merge` when merging on datetime columns on different resolutions (:issue:`53200`) +- Bug in :func:`read_csv` raising ``OverflowError`` for ``engine="pyarrow"`` and ``parse_dates`` set (:issue:`53295`) - Bug in :func:`to_datetime` was inferring format to contain ``"%H"`` instead of ``"%I"`` if date contained "AM" / "PM" tokens (:issue:`53147`) - Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_*`` keywords when set to False ``dtype_backend="pyarrow"`` (:issue:`52872`) - Bug in :meth:`DataFrame.sort_values` raising for PyArrow ``dictionary`` dtype (:issue:`53232`) @@ -37,7 +38,6 @@ Bug fixes - Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`) - Bug in :meth:`DataFrame.__getitem__` not preserving dtypes for :class:`MultiIndex` partial keys (:issue:`51895`) - - .. --------------------------------------------------------------------------- .. _whatsnew_202.other: diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index f354fe9a53b48..2db759719fcb4 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -1120,6 +1120,9 @@ def unpack_if_single_element(arg): return arg def converter(*date_cols, col: Hashable): + if len(date_cols) == 1 and date_cols[0].dtype.kind in "Mm": + return date_cols[0] + if date_parser is lib.no_default: strs = parsing.concat_date_cols(date_cols) date_fmt = ( diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 8c3474220cde8..94f4066ea1cb2 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -2218,3 +2218,23 @@ def test_parse_dates_dict_format_index(all_parsers): index=Index([Timestamp("2019-12-31"), Timestamp("2020-12-31")], name="a"), ) tm.assert_frame_equal(result, expected) + + +def test_parse_dates_arrow_engine(all_parsers): + # GH#53295 + parser = all_parsers + data = """a,b +2000-01-01 00:00:00,1 +2000-01-01 00:00:01,1""" + + result = parser.read_csv(StringIO(data), parse_dates=["a"]) + expected = DataFrame( + { + "a": [ + Timestamp("2000-01-01 00:00:00"), + Timestamp("2000-01-01 00:00:01"), + ], + "b": 1, + } + ) + tm.assert_frame_equal(result, expected) From e821e1b0b2a9d8577906092030554bfc86c0d562 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 23 May 2023 18:54:52 -0700 Subject: [PATCH 32/41] Backport PR #53343: REF: Use np.result_type instead of np.find_common_type (#53359) * Backport PR #53343: REF: Use np.result_type instead of np.find_common_type * Update pandas/core/internals/concat.py * Update pandas/core/arrays/sparse/dtype.py * Update pandas/core/dtypes/concat.py --- pandas/core/algorithms.py | 3 ++- pandas/core/arrays/sparse/dtype.py | 6 ++++-- pandas/core/dtypes/cast.py | 28 ++++++++++++++++++++++++++- pandas/core/dtypes/concat.py | 3 +++ pandas/core/internals/concat.py | 3 ++- pandas/tests/dtypes/test_inference.py | 4 +--- 6 files changed, 39 insertions(+), 8 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 29009c1627cfb..d312612cdc680 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -35,6 +35,7 @@ from pandas.core.dtypes.cast import ( construct_1d_object_array_from_listlike, infer_dtype_from_array, + np_find_common_type, ) 
from pandas.core.dtypes.common import ( ensure_float64, @@ -522,7 +523,7 @@ def f(c, v): f = np.in1d else: - common = np.find_common_type([values.dtype, comps_array.dtype], []) + common = np_find_common_type(values.dtype, comps_array.dtype) values = values.astype(common, copy=False) comps_array = comps_array.astype(common, copy=False) f = htable.ismember diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index c7a44d3606fa6..185ed2911c11e 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -400,6 +400,8 @@ def _subtype_with_str(self): def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: # TODO for now only handle SparseDtypes and numpy dtypes => extend # with other compatible extension dtypes + from pandas.core.dtypes.cast import np_find_common_type + if any( isinstance(x, ExtensionDtype) and not isinstance(x, SparseDtype) for x in dtypes @@ -420,5 +422,5 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: stacklevel=find_stack_level(), ) - np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes] - return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value) + np_dtypes = (x.subtype if isinstance(x, SparseDtype) else x for x in dtypes) + return SparseDtype(np_find_common_type(*np_dtypes), fill_value=fill_value) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 05e108099fd5a..156c7c67c7011 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1368,6 +1368,32 @@ def common_dtype_categorical_compat( return dtype +def np_find_common_type(*dtypes: np.dtype) -> np.dtype: + """ + np.find_common_type implementation pre-1.25 deprecation using np.result_type + https://github.com/pandas-dev/pandas/pull/49569#issuecomment-1308300065 + + Parameters + ---------- + dtypes : np.dtypes + + Returns + ------- + np.dtype + """ + try: + common_dtype = np.result_type(*dtypes) + if common_dtype.kind in "mMSU": + # NumPy promotion currently (1.25) misbehaves for for times and strings, + # so fall back to object (find_common_dtype did unless there + # was only one dtype) + common_dtype = np.dtype("O") + + except TypeError: + common_dtype = np.dtype("O") + return common_dtype + + @overload def find_common_type(types: list[np.dtype]) -> np.dtype: ... 
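# Editor's illustrative sketch, read off the np_find_common_type body above
# (the example values are assumptions checked only against that logic):
# numeric inputs promote via np.result_type, while datetime/timedelta/string
# results and promotion TypeErrors fall back to object:
#   >>> import numpy as np
#   >>> np_find_common_type(np.dtype("int64"), np.dtype("float32"))
#   dtype('float64')
#   >>> np_find_common_type(np.dtype("M8[ns]"), np.dtype("M8[s]"))
#   dtype('O')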
@@ -1435,7 +1461,7 @@ def find_common_type(types): if is_integer_dtype(t) or is_float_dtype(t) or is_complex_dtype(t): return np.dtype("object") - return np.find_common_type(types, []) + return np_find_common_type(*types) def construct_2d_arraylike_from_scalar( diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 28bc849088d5f..709563020778e 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -13,6 +13,7 @@ from pandas.core.dtypes.cast import ( common_dtype_categorical_compat, find_common_type, + np_find_common_type, ) from pandas.core.dtypes.common import is_dtype_equal from pandas.core.dtypes.dtypes import ( @@ -110,6 +111,8 @@ def is_nonempty(x) -> bool: # coerce to object to_concat = [x.astype("object") for x in to_concat] kinds = {"o"} + else: + target_dtype = np_find_common_type(*dtypes) result = np.concatenate(to_concat, axis=axis) if "b" in kinds and result.dtype.kind in ["i", "u", "f"]: diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index a33ce8fd5c459..e9cf7a151b627 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -28,6 +28,7 @@ from pandas.core.dtypes.cast import ( ensure_dtype_can_hold_na, find_common_type, + np_find_common_type, ) from pandas.core.dtypes.common import ( is_1d_only_ea_dtype, @@ -144,7 +145,7 @@ def concat_arrays(to_concat: list) -> ArrayLike: target_dtype = to_concat_no_proxy[0].dtype elif all(x.kind in ["i", "u", "b"] and isinstance(x, np.dtype) for x in dtypes): # GH#42092 - target_dtype = np.find_common_type(list(dtypes), []) + target_dtype = np_find_common_type(*dtypes) else: target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy]) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 650eb033dcd9e..a65fa240bf7f3 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1012,9 +1012,7 @@ def test_maybe_convert_objects_itemsize(self, data0, data1): data = [data0, data1] arr = np.array(data, dtype="object") - common_kind = np.find_common_type( - [type(data0), type(data1)], scalar_types=[] - ).kind + common_kind = np.result_type(type(data0), type(data1)).kind kind0 = "python" if not hasattr(data0, "dtype") else data0.dtype.kind kind1 = "python" if not hasattr(data1, "dtype") else data1.dtype.kind if kind0 != "python" and kind1 != "python": From 584504c5f54a5630891077bfce2fba12ac651e00 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 24 May 2023 09:29:17 -0700 Subject: [PATCH 33/41] Backport PR #53364 on branch 2.0.x (Update whatsnew) (#53371) Backport PR #53364: Update whatsnew Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index b912fd5ee9d2d..e0fc6b90708dc 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -944,7 +944,7 @@ Removal of prior version deprecations/changes - Disallow passing non-keyword arguments to :meth:`DataFrame.where` and :meth:`Series.where` except for ``cond`` and ``other`` (:issue:`41523`) - Disallow passing non-keyword arguments to :meth:`Series.set_axis` and :meth:`DataFrame.set_axis` except for ``labels`` (:issue:`41491`) - Disallow passing non-keyword arguments to :meth:`Series.rename_axis` and :meth:`DataFrame.rename_axis` 
except for ``mapper`` (:issue:`47587`) -- Disallow passing non-keyword arguments to :meth:`Series.clip` and :meth:`DataFrame.clip` (:issue:`41511`) +- Disallow passing non-keyword arguments to :meth:`Series.clip` and :meth:`DataFrame.clip` except ``lower`` and ``upper`` (:issue:`41511`) - Disallow passing non-keyword arguments to :meth:`Series.bfill`, :meth:`Series.ffill`, :meth:`DataFrame.bfill` and :meth:`DataFrame.ffill` (:issue:`41508`) - Disallow passing non-keyword arguments to :meth:`DataFrame.replace`, :meth:`Series.replace` except for ``to_replace`` and ``value`` (:issue:`47587`) - Disallow passing non-keyword arguments to :meth:`DataFrame.sort_values` except for ``by`` (:issue:`41505`) From c8cd0277f8f889c5db7463ef7f36b495b5c9de69 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 24 May 2023 13:09:27 -0700 Subject: [PATCH 34/41] Backport PR #53344: BUG: Correct .type for pyarrow.map_ and pyarrow.struct types (#53363) * Backport PR #53344: BUG: Correct .type for pyarrow.map_ and pyarrow.struct types * Fix conflict --- doc/source/whatsnew/v2.0.2.rst | 1 + pandas/core/arrays/arrow/dtype.py | 2 ++ pandas/tests/extension/test_arrow.py | 3 ++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index a39d68a2f8ae9..373612762cb69 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -25,6 +25,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Bug in :class:`.arrays.ArrowExtensionArray` incorrectly assigning ``dict`` instead of ``list`` for ``.type`` with ``pyarrow.map_`` and raising a ``NotImplementedError`` with ``pyarrow.struct`` (:issue:`53328`) - Bug in :func:`api.interchange.from_dataframe` was raising ``IndexError`` on empty categorical data (:issue:`53077`) - Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py index 20a9902f3bc90..7d4fbb788cc9c 100644 --- a/pandas/core/arrays/arrow/dtype.py +++ b/pandas/core/arrays/arrow/dtype.py @@ -140,6 +140,8 @@ def type(self): elif pa.types.is_list(pa_type) or pa.types.is_large_list(pa_type): return list elif pa.types.is_map(pa_type): + return list + elif pa.types.is_struct(pa_type): return dict elif pa.types.is_null(pa_type): # TODO: None? pd.NA? pa.null? 
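# Editor's illustrative sketch, mirroring the parametrized cases added to
# test_arrow.py below: after this fix, ArrowDtype.type reports the Python
# type a pyarrow scalar actually converts to; a map value converts to a list
# of key/value pairs, a struct value to a dict:
#   >>> import pyarrow as pa
#   >>> import pandas as pd
#   >>> pd.ArrowDtype(pa.map_(pa.string(), pa.int64())).type
#   <class 'list'>
#   >>> pd.ArrowDtype(pa.struct([("f1", pa.int8()), ("f2", pa.string())])).type
#   <class 'dict'>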
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index fdff9dd873fec..8907137c71844 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1580,7 +1580,8 @@ def test_mode_dropna_false_mode_na(data): [pa.large_string(), str], [pa.list_(pa.int64()), list], [pa.large_list(pa.int64()), list], - [pa.map_(pa.string(), pa.int64()), dict], + [pa.map_(pa.string(), pa.int64()), list], + [pa.struct([("f1", pa.int8()), ("f2", pa.string())]), dict], [pa.dictionary(pa.int64(), pa.int64()), CategoricalDtypeType], ], ) From 258e55e07b6ab5eca4c3557587b65f2a2cce88f5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 24 May 2023 13:10:11 -0700 Subject: [PATCH 35/41] FIX: Ignore typing and numba warnings in 2.0.x (#53372) --- doc/source/user_guide/window.rst | 2 ++ doc/source/whatsnew/v2.0.2.rst | 1 + pandas/core/_numba/kernels/mean_.py | 4 ++-- pandas/core/_numba/kernels/sum_.py | 4 ++-- pandas/core/_numba/kernels/var_.py | 4 ++-- pandas/core/groupby/groupby.py | 4 ++-- 6 files changed, 11 insertions(+), 8 deletions(-) diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst index e08fa81c5fa09..4e7733e25fa88 100644 --- a/doc/source/user_guide/window.rst +++ b/doc/source/user_guide/window.rst @@ -89,6 +89,7 @@ For example, a `weighted mean = min_periods and nobs > 0: diff --git a/pandas/core/_numba/kernels/sum_.py b/pandas/core/_numba/kernels/sum_.py index 056897189fe67..eb8846b1fa50a 100644 --- a/pandas/core/_numba/kernels/sum_.py +++ b/pandas/core/_numba/kernels/sum_.py @@ -92,7 +92,7 @@ def sliding_sum( sum_x, compensation_add, num_consecutive_same_value, - prev_value, + prev_value, # pyright: ignore[reportGeneralTypeIssues] ) else: for j in range(start[i - 1], s): @@ -115,7 +115,7 @@ def sliding_sum( sum_x, compensation_add, num_consecutive_same_value, - prev_value, + prev_value, # pyright: ignore[reportGeneralTypeIssues] ) if nobs == 0 == min_periods: diff --git a/pandas/core/_numba/kernels/var_.py b/pandas/core/_numba/kernels/var_.py index d3243f4928dca..2c4559ddc2121 100644 --- a/pandas/core/_numba/kernels/var_.py +++ b/pandas/core/_numba/kernels/var_.py @@ -110,7 +110,7 @@ def sliding_var( ssqdm_x, compensation_add, num_consecutive_same_value, - prev_value, + prev_value, # pyright: ignore[reportGeneralTypeIssues] ) else: for j in range(start[i - 1], s): @@ -135,7 +135,7 @@ def sliding_var( ssqdm_x, compensation_add, num_consecutive_same_value, - prev_value, + prev_value, # pyright: ignore[reportGeneralTypeIssues] ) if nobs >= min_periods and nobs > ddof: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index dc7dff74369bb..42b7fd9b635df 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3074,12 +3074,12 @@ def _nth( # error: No overload variant of "where" matches argument types # "Any", "NAType", "Any" values = np.where(nulls, NA, grouper) # type: ignore[call-overload] - grouper = Index(values, dtype="Int64") + grouper = Index(values, dtype="Int64") # type: ignore[assignment] else: # create a grouper with the original parameters, but on dropped # object - grouper, _, _ = get_grouper( + grouper, _, _ = get_grouper( # type: ignore[assignment] dropped, key=self.keys, axis=self.axis, From 8fc7924fb96e9e355d02c96f63bf7ab64cf98b18 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 25 May 2023 12:05:55 -0700 Subject: [PATCH 
36/41] Backport PR #53382 on branch 2.0.x (BUG: convert_dtypes(dtype_backend="pyarrow") losing tz for tz-aware dtypes) (#53388) Backport PR #53382: BUG: convert_dtypes(dtype_backend="pyarrow") losing tz for tz-aware dtypes Co-authored-by: Luke Manley --- doc/source/whatsnew/v2.0.2.rst | 1 + pandas/core/arrays/arrow/array.py | 3 +++ pandas/core/dtypes/cast.py | 4 +++- pandas/tests/frame/methods/test_convert_dtypes.py | 13 ++++++++++++- 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 1ebb512776556..b23f92cc51f1c 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -33,6 +33,7 @@ Bug fixes - Bug in :func:`read_csv` raising ``OverflowError`` for ``engine="pyarrow"`` and ``parse_dates`` set (:issue:`53295`) - Bug in :func:`to_datetime` was inferring format to contain ``"%H"`` instead of ``"%I"`` if date contained "AM" / "PM" tokens (:issue:`53147`) - Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_*`` keywords when set to False ``dtype_backend="pyarrow"`` (:issue:`52872`) +- Bug in :meth:`DataFrame.convert_dtypes` losing timezone for tz-aware dtypes and ``dtype_backend="pyarrow"`` (:issue:`53382`) - Bug in :meth:`DataFrame.sort_values` raising for PyArrow ``dictionary`` dtype (:issue:`53232`) - Bug in :meth:`Series.describe` treating pyarrow-backed timestamps and timedeltas as categorical data (:issue:`53001`) - Bug in :meth:`Series.rename` not making a lazy copy when Copy-on-Write is enabled when a scalar is passed to it (:issue:`52450`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 611ef142a72a5..445ec36135d5f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -53,6 +53,7 @@ is_object_dtype, is_scalar, ) +from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna from pandas.core import roperator @@ -168,6 +169,8 @@ def to_pyarrow_type( return dtype.pyarrow_dtype elif isinstance(dtype, pa.DataType): return dtype + elif isinstance(dtype, DatetimeTZDtype): + return pa.timestamp(dtype.unit, dtype.tz) elif dtype: try: # Accepts python types too diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 156c7c67c7011..2dbd9465be3c6 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1134,7 +1134,9 @@ def convert_dtypes( and not isinstance(inferred_dtype, StringDtype) ) ): - if isinstance(inferred_dtype, PandasExtensionDtype): + if isinstance(inferred_dtype, PandasExtensionDtype) and not isinstance( + inferred_dtype, DatetimeTZDtype + ): base_dtype = inferred_dtype.base elif isinstance(inferred_dtype, (BaseMaskedDtype, ArrowDtype)): base_dtype = inferred_dtype.numpy_dtype diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index a749cd11df4f7..2adee158379bb 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -53,7 +53,8 @@ def test_pyarrow_dtype_backend(self): "c": pd.Series([True, False, None], dtype=np.dtype("O")), "d": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")), "e": pd.Series(pd.date_range("2022", periods=3)), - "f": pd.Series(pd.timedelta_range("1D", periods=3)), + "f": pd.Series(pd.date_range("2022", periods=3, tz="UTC").as_unit("s")), + "g": pd.Series(pd.timedelta_range("1D", periods=3)), } ) result = df.convert_dtypes(dtype_backend="pyarrow") @@ -76,6 +77,16 @@ 
def test_pyarrow_dtype_backend(self): ) ), "f": pd.arrays.ArrowExtensionArray( + pa.array( + [ + datetime.datetime(2022, 1, 1), + datetime.datetime(2022, 1, 2), + datetime.datetime(2022, 1, 3), + ], + type=pa.timestamp(unit="s", tz="UTC"), + ) + ), + "g": pd.arrays.ArrowExtensionArray( pa.array( [ datetime.timedelta(1),
From 776f43a7131511bc12ffe7eca039f74fa48ffb04 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 27 May 2023 08:20:56 -0700 Subject: [PATCH 37/41] Backport PR #53411 on branch 2.0.x (DOC: Fix tooltips and captions label) (#53412) Backport PR #53411: DOC: Fix tooltips and captions label Co-authored-by: Marc Garcia --- doc/source/user_guide/style.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 79596c946c068..15ea0066156b5 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -520,7 +520,7 @@ "\n", "*New in version 1.2.0*\n", "\n", - "The [.set_td_classes()][tdclass] method accepts a DataFrame with matching indices and columns to the underlying [Styler][styler]'s DataFrame. That DataFrame will contain strings as css-classes to add to individual data cells: the `<td>` elements of the `<table>`. Rather than use external CSS we will create our classes internally and add them to table style. We will save adding the borders until the [section on tooltips](#Tooltips).\n", + "The [.set_td_classes()][tdclass] method accepts a DataFrame with matching indices and columns to the underlying [Styler][styler]'s DataFrame. That DataFrame will contain strings as css-classes to add to individual data cells: the `<td>` elements of the `<table>`. Rather than use external CSS we will create our classes internally and add them to table style. We will save adding the borders until the [section on tooltips](#Tooltips-and-Captions).\n", "\n", "[tdclass]: ../reference/api/pandas.io.formats.style.Styler.set_td_classes.rst\n", "[styler]: ../reference/api/pandas.io.formats.style.Styler.rst"
From 12fa6c25f17d0bf7e1e9f3cfae910df663e8a93e Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sat, 27 May 2023 23:30:03 -0700 Subject: [PATCH 38/41] Backport PR #53393 on branch 2.0.x (DOC: Prepare release notes for 2.0.2) (#53396) Backport PR #53393: DOC: Prepare release notes for 2.0.2 Co-authored-by: Marc Garcia --- doc/source/whatsnew/v2.0.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index b23f92cc51f1c..b5051a10d9aa3 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -1,6 +1,6 @@ .. _whatsnew_202: -What's new in 2.0.2 (May ..., 2023) +What's new in 2.0.2 (May 26, 2023) ----------------------------------- These are the changes in pandas 2.0.2. See :ref:`release` for a full changelog
From f0d09989403a8f011dbeb7e50afaa6a13358ec08 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 28 May 2023 02:29:31 -0700 Subject: [PATCH 39/41] Backport PR #53413 on branch 2.0.x (DOC: Update release date for 2.0.2) (#53424) Backport PR #53413: DOC: Update release date for 2.0.2 Co-authored-by: Marc Garcia --- doc/source/whatsnew/v2.0.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index b5051a10d9aa3..7a9ee18c72957 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -1,6 +1,6 @@ .. _whatsnew_202: -What's new in 2.0.2 (May 26, 2023) +What's new in 2.0.2 (May 29, 2023) ----------------------------------- These are the changes in pandas 2.0.2. See :ref:`release` for a full changelog
From 049539b700d826f384dc23a8edcf95331ecb7147 Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Sun, 28 May 2023 22:59:24 +0400 Subject: [PATCH 40/41] DOC: Final clean up of release notes for 2.0.2 (#53428) --- doc/source/whatsnew/v2.0.2.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 7a9ee18c72957..81272e5ddb5bc 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -39,7 +39,6 @@ Bug fixes - Bug in :meth:`Series.rename` not making a lazy copy when Copy-on-Write is enabled when a scalar is passed to it (:issue:`52450`) - Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`) - Bug in :meth:`DataFrame.__getitem__` not preserving dtypes for :class:`MultiIndex` partial keys (:issue:`51895`) -- .. --------------------------------------------------------------------------- .. _whatsnew_202.other: From 965ceca9fd796940050d6fc817707bba1c4f9bff Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Mon, 29 May 2023 01:02:48 +0400 Subject: [PATCH 41/41] RLS: 2.0.2