diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml
index 2a7601f196ec4..fd7c3587f2254 100644
--- a/.github/actions/run-tests/action.yml
+++ b/.github/actions/run-tests/action.yml
@@ -7,7 +7,7 @@ runs:
shell: bash -el {0}
- name: Publish test results
- uses: actions/upload-artifact@v2
+ uses: actions/upload-artifact@v3
with:
name: Test results
path: test-data.xml
@@ -19,7 +19,7 @@ runs:
if: failure()
- name: Upload coverage to Codecov
- uses: codecov/codecov-action@v2
+ uses: codecov/codecov-action@v3
with:
flags: unittests
name: codecov-pandas
diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml
index b667075e87144..8aa417c1d8fd4 100644
--- a/.github/actions/setup-conda/action.yml
+++ b/.github/actions/setup-conda/action.yml
@@ -13,7 +13,7 @@ runs:
using: composite
steps:
- name: Install ${{ inputs.environment-file }}
- uses: mamba-org/provision-with-micromamba@v12
+ uses: mamba-org/provision-with-micromamba@v15
with:
environment-file: ${{ inputs.environment-file }}
environment-name: ${{ inputs.environment-name }}
diff --git a/.github/workflows/32-bit-linux.yml b/.github/workflows/32-bit-linux.yml
deleted file mode 100644
index 08026a5fd637f..0000000000000
--- a/.github/workflows/32-bit-linux.yml
+++ /dev/null
@@ -1,58 +0,0 @@
-name: 32 Bit Linux
-
-on:
- push:
- branches:
- - main
- - 2.0.x
- pull_request:
- branches:
- - main
- - 2.0.x
- paths-ignore:
- - "doc/**"
-
-permissions:
- contents: read
-
-jobs:
- pytest:
- runs-on: ubuntu-22.04
- steps:
- - name: Checkout
- uses: actions/checkout@v3
- with:
- fetch-depth: 0
-
- - name: Run 32-bit manylinux2014 Docker Build / Tests
- run: |
- # Without this (line 34), versioneer will not be able to determine the pandas version.
- # This is because of a security update to git that blocks it from reading the config folder if
- # it is not owned by the current user. We hit this since the "mounted" folder is not hit by the
- # Docker container.
- # xref https://github.com/pypa/manylinux/issues/1309
- docker pull quay.io/pypa/manylinux2014_i686
- docker run --platform linux/386 -v $(pwd):/pandas quay.io/pypa/manylinux2014_i686 \
- /bin/bash -xc "cd pandas && \
- git config --global --add safe.directory /pandas && \
- /opt/python/cp38-cp38/bin/python -m venv ~/virtualenvs/pandas-dev && \
- . ~/virtualenvs/pandas-dev/bin/activate && \
- python -m pip install --no-deps -U pip wheel 'setuptools<60.0.0' && \
- python -m pip install versioneer[toml] && \
- python -m pip install cython numpy python-dateutil pytz pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.34.2 && \
- python setup.py build_ext -q -j1 && \
- python -m pip install --no-build-isolation --no-use-pep517 -e . && \
- python -m pip list && \
- export PANDAS_CI=1 && \
- pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml"
-
- - name: Publish test results for Python 3.8-32 bit full Linux
- uses: actions/upload-artifact@v3
- with:
- name: Test results
- path: test-data.xml
- if: failure()
- concurrency:
- # https://github.community/t/concurrecy-not-work-for-push/183068/7
- group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-32bit
- cancel-in-progress: true
diff --git a/.github/workflows/macos-windows.yml b/.github/workflows/macos-windows.yml
deleted file mode 100644
index fd6560d61b160..0000000000000
--- a/.github/workflows/macos-windows.yml
+++ /dev/null
@@ -1,59 +0,0 @@
-name: Windows-macOS
-
-on:
- push:
- branches:
- - main
- - 2.0.x
- pull_request:
- branches:
- - main
- - 2.0.x
- paths-ignore:
- - "doc/**"
-
-env:
- PANDAS_CI: 1
- PYTEST_TARGET: pandas
- PATTERN: "not slow and not db and not network and not single_cpu"
-
-permissions:
- contents: read
-
-jobs:
- pytest:
- defaults:
- run:
- shell: bash -el {0}
- timeout-minutes: 180
- strategy:
- matrix:
- os: [macos-latest, windows-latest]
- env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml]
- fail-fast: false
- runs-on: ${{ matrix.os }}
- name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }}
- concurrency:
- # https://github.community/t/concurrecy-not-work-for-push/183068/7
- group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.os }}
- cancel-in-progress: true
- env:
- # GH 47443: PYTEST_WORKERS > 1 crashes Windows builds with memory related errors
- PYTEST_WORKERS: ${{ matrix.os == 'macos-latest' && 'auto' || '1' }}
-
- steps:
- - name: Checkout
- uses: actions/checkout@v3
- with:
- fetch-depth: 0
-
- - name: Set up Conda
- uses: ./.github/actions/setup-conda
- with:
- environment-file: ci/deps/${{ matrix.env_file }}
-
- - name: Build Pandas
- uses: ./.github/actions/build_pandas
-
- - name: Test
- uses: ./.github/actions/run-tests
diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml
index fa1b5e5d4fba3..6f1fa771a7854 100644
--- a/.github/workflows/package-checks.yml
+++ b/.github/workflows/package-checks.yml
@@ -14,6 +14,10 @@ on:
permissions:
contents: read
+defaults:
+ run:
+ shell: bash -el {0}
+
jobs:
pip:
if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}}
@@ -44,9 +48,39 @@ jobs:
run: |
python -m pip install --upgrade pip setuptools wheel python-dateutil pytz numpy cython
python -m pip install versioneer[toml]
- shell: bash -el {0}
- name: Pip install with extra
- run: |
- python -m pip install -e .[${{ matrix.extra }}] --no-build-isolation
- shell: bash -el {0}
+ run: python -m pip install -e .[${{ matrix.extra }}] --no-build-isolation
+ conda_forge_recipe:
+ if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}}
+ runs-on: ubuntu-22.04
+ strategy:
+ matrix:
+ python-version: ['3.9', '3.10', '3.11']
+ fail-fast: false
+ name: Test Conda Forge Recipe - Python ${{ matrix.python-version }}
+ concurrency:
+ # https://github.community/t/concurrecy-not-work-for-push/183068/7
+ group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-conda-forge-recipe-${{ matrix.python-version }}
+ cancel-in-progress: true
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+
+ - name: Set up Python
+ uses: mamba-org/provision-with-micromamba@v15
+ with:
+ environment-file: false
+ environment-name: recipe-test
+ extra-specs: |
+ python=${{ matrix.python-version }}
+ boa
+ conda-verify
+ channels: conda-forge
+ cache-downloads: true
+ cache-env: true
+
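+      # Note: boa supplies the "conda mambabuild" subcommand used below; the
+      # recipe under test is ci/meta.yaml, added in this PR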
+ - name: Build conda package
+ run: conda mambabuild ci --no-anaconda-upload --verify --strict-verify --output --output-folder .
diff --git a/.github/workflows/python-dev.yml b/.github/workflows/python-dev.yml
deleted file mode 100644
index 39b0439e2d6f4..0000000000000
--- a/.github/workflows/python-dev.yml
+++ /dev/null
@@ -1,95 +0,0 @@
-# This workflow may or may not run depending on the state of the next
-# unreleased Python version. DO NOT DELETE IT.
-#
-# In general, this file will remain frozen(present, but not running) until:
-# - The next unreleased Python version has released beta 1
-# - This version should be available on GitHub Actions.
-# - Our required build/runtime dependencies(numpy, pytz, Cython, python-dateutil)
-# support that unreleased Python version.
-# To unfreeze, comment out the ``if: false`` condition, and make sure you update
-# the name of the workflow and Python version in actions/setup-python to: '3.12-dev'
-#
-# After it has been unfrozen, this file should remain unfrozen(present, and running) until:
-# - The next Python version has been officially released.
-# OR
-# - Most/All of our optional dependencies support Python 3.11 AND
-# - The next Python version has released a rc(we are guaranteed a stable ABI).
-# To freeze this file, uncomment out the ``if: false`` condition, and migrate the jobs
-# to the corresponding posix/windows-macos/sdist etc. workflows.
-# Feel free to modify this comment as necessary.
-
-name: Python Dev
-
-on:
- push:
- branches:
- - main
- - 2.0.x
- - None
- pull_request:
- branches:
- - main
- - 2.0.x
- - None
- paths-ignore:
- - "doc/**"
-
-env:
- PYTEST_WORKERS: "auto"
- PANDAS_CI: 1
- PATTERN: "not slow and not network and not clipboard and not single_cpu"
- COVERAGE: true
- PYTEST_TARGET: pandas
-
-permissions:
- contents: read
-
-jobs:
- build:
- if: false # Uncomment this to freeze the workflow, comment it to unfreeze
- runs-on: ${{ matrix.os }}
- strategy:
- fail-fast: false
- matrix:
- os: [ubuntu-22.04, macOS-latest, windows-latest]
-
- name: actions-311-dev
- timeout-minutes: 120
-
- concurrency:
- #https://github.community/t/concurrecy-not-work-for-push/183068/7
- group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-${{ matrix.pytest_target }}-dev
- cancel-in-progress: true
-
- steps:
- - uses: actions/checkout@v3
- with:
- fetch-depth: 0
-
- - name: Set up Python Dev Version
- uses: actions/setup-python@v4
- with:
- python-version: '3.11-dev'
-
- - name: Install dependencies
- run: |
- python --version
- python -m pip install --upgrade pip setuptools wheel
- python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple numpy
- python -m pip install git+https://github.com/nedbat/coveragepy.git
- python -m pip install versioneer[toml]
- python -m pip install python-dateutil pytz cython hypothesis>=6.34.2 pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17
- python -m pip list
-
- # GH 47305: Parallel build can cause flaky ImportError from pandas/_libs/tslibs
- - name: Build Pandas
- run: |
- python setup.py build_ext -q -j1
- python -m pip install -e . --no-build-isolation --no-use-pep517 --no-index
-
- - name: Build Version
- run: |
- python -c "import pandas; pandas.show_versions();"
-
- - name: Test
- uses: ./.github/actions/run-tests
diff --git a/.github/workflows/sdist.yml b/.github/workflows/sdist.yml
deleted file mode 100644
index 284935d7051e1..0000000000000
--- a/.github/workflows/sdist.yml
+++ /dev/null
@@ -1,95 +0,0 @@
-name: sdist
-
-on:
- push:
- branches:
- - main
- - 2.0.x
- pull_request:
- branches:
- - main
- - 2.0.x
- types: [labeled, opened, synchronize, reopened]
- paths-ignore:
- - "doc/**"
-
-permissions:
- contents: read
-
-jobs:
- build:
- if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}}
- runs-on: ubuntu-22.04
- timeout-minutes: 60
- defaults:
- run:
- shell: bash -el {0}
-
- strategy:
- fail-fast: false
- matrix:
- python-version: ["3.8", "3.9", "3.10", "3.11"]
- concurrency:
- # https://github.community/t/concurrecy-not-work-for-push/183068/7
- group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{matrix.python-version}}-sdist
- cancel-in-progress: true
-
- steps:
- - uses: actions/checkout@v3
- with:
- fetch-depth: 0
-
- - name: Set up Python
- uses: actions/setup-python@v4
- with:
- python-version: ${{ matrix.python-version }}
-
- - name: Install dependencies
- run: |
- python -m pip install --upgrade pip setuptools wheel
- python -m pip install versioneer[toml]
-
- # GH 39416
- pip install numpy
-
- - name: Build pandas sdist
- run: |
- pip list
- python setup.py sdist --formats=gztar
-
- - name: Upload sdist artifact
- uses: actions/upload-artifact@v3
- with:
- name: ${{matrix.python-version}}-sdist.gz
- path: dist/*.gz
-
- - name: Set up Conda
- uses: ./.github/actions/setup-conda
- with:
- environment-file: false
- environment-name: pandas-sdist
- extra-specs: |
- python =${{ matrix.python-version }}
-
- - name: Install pandas from sdist
- run: |
- pip list
- python -m pip install dist/*.gz
-
- - name: Force oldest supported NumPy
- run: |
- case "${{matrix.python-version}}" in
- 3.8)
- pip install numpy==1.20.3 ;;
- 3.9)
- pip install numpy==1.20.3 ;;
- 3.10)
- pip install numpy==1.21.2 ;;
- 3.11)
- pip install numpy==1.23.2 ;;
- esac
-
- - name: Import pandas
- run: |
- cd ..
- python -c "import pandas; pandas.show_versions();"
diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml
deleted file mode 100644
index 7bffd066b7e64..0000000000000
--- a/.github/workflows/ubuntu.yml
+++ /dev/null
@@ -1,167 +0,0 @@
-name: Ubuntu
-
-on:
- push:
- branches:
- - main
- - 2.0.x
- pull_request:
- branches:
- - main
- - 2.0.x
- paths-ignore:
- - "doc/**"
-
-permissions:
- contents: read
-
-jobs:
- pytest:
- runs-on: ubuntu-22.04
- defaults:
- run:
- shell: bash -el {0}
- timeout-minutes: 180
- strategy:
- matrix:
- env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml]
- # Prevent the include jobs from overriding other jobs
- pattern: [""]
- include:
- - name: "Downstream Compat"
- env_file: actions-38-downstream_compat.yaml
- pattern: "not slow and not network and not single_cpu"
- pytest_target: "pandas/tests/test_downstream.py"
- - name: "Minimum Versions"
- env_file: actions-38-minimum_versions.yaml
- pattern: "not slow and not network and not single_cpu"
- - name: "Locale: it_IT"
- env_file: actions-38.yaml
- pattern: "not slow and not network and not single_cpu"
- extra_apt: "language-pack-it"
- # Use the utf8 version as the default, it has no bad side-effect.
- lang: "it_IT.utf8"
- lc_all: "it_IT.utf8"
- # Also install it_IT (its encoding is ISO8859-1) but do not activate it.
- # It will be temporarily activated during tests with locale.setlocale
- extra_loc: "it_IT"
- - name: "Locale: zh_CN"
- env_file: actions-38.yaml
- pattern: "not slow and not network and not single_cpu"
- extra_apt: "language-pack-zh-hans"
- # Use the utf8 version as the default, it has no bad side-effect.
- lang: "zh_CN.utf8"
- lc_all: "zh_CN.utf8"
- # Also install zh_CN (its encoding is gb2312) but do not activate it.
- # It will be temporarily activated during tests with locale.setlocale
- extra_loc: "zh_CN"
- - name: "Copy-on-Write"
- env_file: actions-310.yaml
- pattern: "not slow and not network and not single_cpu"
- pandas_copy_on_write: "1"
- - name: "Pypy"
- env_file: actions-pypy-38.yaml
- pattern: "not slow and not network and not single_cpu"
- test_args: "--max-worker-restart 0"
- - name: "Numpy Dev"
- env_file: actions-310-numpydev.yaml
- pattern: "not slow and not network and not single_cpu"
- test_args: "-W error::DeprecationWarning -W error::FutureWarning"
- # TODO(cython3): Re-enable once next-beta(after beta 1) comes out
- # There are some warnings failing the build with -werror
- pandas_ci: "0"
- - name: "Pyarrow Nightly"
- env_file: actions-311-pyarrownightly.yaml
- pattern: "not slow and not network and not single_cpu"
- fail-fast: false
- name: ${{ matrix.name || matrix.env_file }}
- env:
- ENV_FILE: ci/deps/${{ matrix.env_file }}
- PATTERN: ${{ matrix.pattern }}
- EXTRA_APT: ${{ matrix.extra_apt || '' }}
- LANG: ${{ matrix.lang || '' }}
- LC_ALL: ${{ matrix.lc_all || '' }}
- PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }}
- PANDAS_CI: ${{ matrix.pandas_ci || '1' }}
- TEST_ARGS: ${{ matrix.test_args || '' }}
- PYTEST_WORKERS: 'auto'
- PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }}
- concurrency:
- # https://github.community/t/concurrecy-not-work-for-push/183068/7
- group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}
- cancel-in-progress: true
-
- services:
- mysql:
- image: mysql
- env:
- MYSQL_ALLOW_EMPTY_PASSWORD: yes
- MYSQL_DATABASE: pandas
- options: >-
- --health-cmd "mysqladmin ping"
- --health-interval 10s
- --health-timeout 5s
- --health-retries 5
- ports:
- - 3306:3306
-
- postgres:
- image: postgres
- env:
- POSTGRES_USER: postgres
- POSTGRES_PASSWORD: postgres
- POSTGRES_DB: pandas
- options: >-
- --health-cmd pg_isready
- --health-interval 10s
- --health-timeout 5s
- --health-retries 5
- ports:
- - 5432:5432
-
- moto:
- image: motoserver/moto:4.1.4
- env:
- AWS_ACCESS_KEY_ID: foobar_key
- AWS_SECRET_ACCESS_KEY: foobar_secret
- ports:
- - 5000:5000
-
- steps:
- - name: Checkout
- uses: actions/checkout@v3
- with:
- fetch-depth: 0
-
- - name: Extra installs
- # xsel for clipboard tests
- run: sudo apt-get update && sudo apt-get install -y xsel ${{ env.EXTRA_APT }}
-
- - name: Generate extra locales
- # These extra locales will be available for locale.setlocale() calls in tests
- run: |
- sudo locale-gen ${{ matrix.extra_loc }}
- if: ${{ matrix.extra_loc }}
-
- - name: Set up Conda
- uses: ./.github/actions/setup-conda
- with:
- environment-file: ${{ env.ENV_FILE }}
-
- - name: Build Pandas
- id: build
- uses: ./.github/actions/build_pandas
-
- - name: Test (not single_cpu)
- uses: ./.github/actions/run-tests
- if: ${{ matrix.name != 'Pypy' }}
- env:
- # Set pattern to not single_cpu if not already set
- PATTERN: ${{ env.PATTERN == '' && 'not single_cpu' || matrix.pattern }}
-
- - name: Test (single_cpu)
- uses: ./.github/actions/run-tests
- env:
- PATTERN: 'single_cpu'
- PYTEST_WORKERS: 1
- if: ${{ matrix.pattern == '' && (always() && steps.build.outcome == 'success')}}
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
new file mode 100644
index 0000000000000..31e2095624347
--- /dev/null
+++ b/.github/workflows/unit-tests.yml
@@ -0,0 +1,315 @@
+name: Unit Tests
+
+on:
+ push:
+ branches:
+ - main
+ - 2.0.x
+ pull_request:
+ branches:
+ - main
+ - 2.0.x
+ paths-ignore:
+ - "doc/**"
+ - "web/**"
+
+permissions:
+ contents: read
+
+defaults:
+ run:
+ shell: bash -el {0}
+
+jobs:
+ ubuntu:
+ runs-on: ubuntu-22.04
+ timeout-minutes: 180
+ strategy:
+ matrix:
+ env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml]
+ # Prevent the include jobs from overriding other jobs
+ pattern: [""]
+ include:
+ - name: "Downstream Compat"
+ env_file: actions-38-downstream_compat.yaml
+ pattern: "not slow and not network and not single_cpu"
+ pytest_target: "pandas/tests/test_downstream.py"
+ - name: "Minimum Versions"
+ env_file: actions-38-minimum_versions.yaml
+ pattern: "not slow and not network and not single_cpu"
+ - name: "Locale: it_IT"
+ env_file: actions-38.yaml
+ pattern: "not slow and not network and not single_cpu"
+ extra_apt: "language-pack-it"
+          # Use the utf8 version as the default, as it has no bad side effects.
+ lang: "it_IT.utf8"
+ lc_all: "it_IT.utf8"
+ # Also install it_IT (its encoding is ISO8859-1) but do not activate it.
+ # It will be temporarily activated during tests with locale.setlocale
+ extra_loc: "it_IT"
+ - name: "Locale: zh_CN"
+ env_file: actions-38.yaml
+ pattern: "not slow and not network and not single_cpu"
+ extra_apt: "language-pack-zh-hans"
+          # Use the utf8 version as the default, as it has no bad side effects.
+ lang: "zh_CN.utf8"
+ lc_all: "zh_CN.utf8"
+ # Also install zh_CN (its encoding is gb2312) but do not activate it.
+ # It will be temporarily activated during tests with locale.setlocale
+ extra_loc: "zh_CN"
+ - name: "Copy-on-Write"
+ env_file: actions-310.yaml
+ pattern: "not slow and not network and not single_cpu"
+ pandas_copy_on_write: "1"
+ - name: "Pypy"
+ env_file: actions-pypy-38.yaml
+ pattern: "not slow and not network and not single_cpu"
+ test_args: "--max-worker-restart 0"
+ - name: "Numpy Dev"
+ env_file: actions-310-numpydev.yaml
+ pattern: "not slow and not network and not single_cpu"
+ test_args: "-W error::DeprecationWarning -W error::FutureWarning"
+          # TODO(cython3): Re-enable once the next beta (after beta 1) comes out
+          # There are some warnings failing the build with -W error
+ pandas_ci: "0"
+ - name: "Pyarrow Nightly"
+ env_file: actions-311-pyarrownightly.yaml
+ pattern: "not slow and not network and not single_cpu"
+ fail-fast: false
+ name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }}
+ env:
+ ENV_FILE: ci/deps/${{ matrix.env_file }}
+ PATTERN: ${{ matrix.pattern }}
+ EXTRA_APT: ${{ matrix.extra_apt || '' }}
+ LANG: ${{ matrix.lang || '' }}
+ LC_ALL: ${{ matrix.lc_all || '' }}
+ PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }}
+ PANDAS_CI: ${{ matrix.pandas_ci || '1' }}
+ TEST_ARGS: ${{ matrix.test_args || '' }}
+ PYTEST_WORKERS: 'auto'
+ PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }}
+ concurrency:
+ # https://github.community/t/concurrecy-not-work-for-push/183068/7
+ group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}
+ cancel-in-progress: true
+
+ services:
+ mysql:
+ image: mysql
+ env:
+ MYSQL_ALLOW_EMPTY_PASSWORD: yes
+ MYSQL_DATABASE: pandas
+ options: >-
+ --health-cmd "mysqladmin ping"
+ --health-interval 10s
+ --health-timeout 5s
+ --health-retries 5
+ ports:
+ - 3306:3306
+
+ postgres:
+ image: postgres
+ env:
+ POSTGRES_USER: postgres
+ POSTGRES_PASSWORD: postgres
+ POSTGRES_DB: pandas
+ options: >-
+ --health-cmd pg_isready
+ --health-interval 10s
+ --health-timeout 5s
+ --health-retries 5
+ ports:
+ - 5432:5432
+
+ moto:
+ image: motoserver/moto:4.1.4
+ env:
+ AWS_ACCESS_KEY_ID: foobar_key
+ AWS_SECRET_ACCESS_KEY: foobar_secret
+ ports:
+ - 5000:5000
+
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+
+ - name: Extra installs
+ # xsel for clipboard tests
+ run: sudo apt-get update && sudo apt-get install -y xsel ${{ env.EXTRA_APT }}
+
+ - name: Generate extra locales
+ # These extra locales will be available for locale.setlocale() calls in tests
+ run: |
+ sudo locale-gen ${{ matrix.extra_loc }}
+ if: ${{ matrix.extra_loc }}
+
+ - name: Set up Conda
+ uses: ./.github/actions/setup-conda
+ with:
+ environment-file: ${{ env.ENV_FILE }}
+
+ - name: Build Pandas
+ id: build
+ uses: ./.github/actions/build_pandas
+
+ - name: Test (not single_cpu)
+ uses: ./.github/actions/run-tests
+ if: ${{ matrix.name != 'Pypy' }}
+ env:
+ # Set pattern to not single_cpu if not already set
+ PATTERN: ${{ env.PATTERN == '' && 'not single_cpu' || matrix.pattern }}
+
+ - name: Test (single_cpu)
+ uses: ./.github/actions/run-tests
+ env:
+ PATTERN: 'single_cpu'
+ PYTEST_WORKERS: 1
+ if: ${{ matrix.pattern == '' && (always() && steps.build.outcome == 'success')}}
+
+ macos-windows:
+ timeout-minutes: 180
+ strategy:
+ matrix:
+ os: [macos-latest, windows-latest]
+ env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml]
+ fail-fast: false
+ runs-on: ${{ matrix.os }}
+ name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }}
+ concurrency:
+ # https://github.community/t/concurrecy-not-work-for-push/183068/7
+ group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.os }}
+ cancel-in-progress: true
+ env:
+ PANDAS_CI: 1
+ PYTEST_TARGET: pandas
+ PATTERN: "not slow and not db and not network and not single_cpu"
+      # GH 47443: PYTEST_WORKERS > 1 crashes Windows builds with memory-related errors
+ PYTEST_WORKERS: ${{ matrix.os == 'macos-latest' && 'auto' || '1' }}
+
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+
+ - name: Set up Conda
+ uses: ./.github/actions/setup-conda
+ with:
+ environment-file: ci/deps/${{ matrix.env_file }}
+
+ - name: Build Pandas
+ uses: ./.github/actions/build_pandas
+
+ - name: Test
+ uses: ./.github/actions/run-tests
+
+ Linux-32-bit:
+ runs-on: ubuntu-22.04
+ container:
+ image: quay.io/pypa/manylinux2014_i686
+ options: --platform linux/386
+ steps:
+ - name: Checkout pandas Repo
+ # actions/checkout does not work since it requires node
+ run: |
+ git config --global --add safe.directory $PWD
+
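+          # push: clone the pushed branch and pin it to the triggering commit;
+          # pull_request: clone the target repo and locally merge the PR ref to
+          # approximate the merge commit actions/checkout would normally produce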
+ if [ $GITHUB_EVENT_NAME != pull_request ]; then
+ git clone --recursive --branch=$GITHUB_REF_NAME https://github.com/${GITHUB_REPOSITORY}.git $GITHUB_WORKSPACE
+ git reset --hard $GITHUB_SHA
+ else
+ git clone --recursive https://github.com/${GITHUB_REPOSITORY}.git $GITHUB_WORKSPACE
+ git fetch origin $GITHUB_REF:my_ref_name
+ git checkout $GITHUB_BASE_REF
+ git -c user.email="you@example.com" merge --no-commit my_ref_name
+ fi
+ - name: Build environment and Run Tests
+ run: |
+ /opt/python/cp38-cp38/bin/python -m venv ~/virtualenvs/pandas-dev
+ . ~/virtualenvs/pandas-dev/bin/activate
+ python -m pip install --no-cache-dir --no-deps -U pip wheel setuptools
+ python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1
+ python setup.py build_ext -q -j$(nproc)
+ python -m pip install --no-cache-dir --no-build-isolation --no-use-pep517 -e .
+ python -m pip list
+ export PANDAS_CI=1
+ python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml
+ concurrency:
+ # https://github.community/t/concurrecy-not-work-for-push/183068/7
+ group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-32bit
+ cancel-in-progress: true
+
+ python-dev:
+ # This job may or may not run depending on the state of the next
+ # unreleased Python version. DO NOT DELETE IT.
+ #
+    # In general, this will remain frozen (present, but not running) until:
+    # - The next unreleased Python version has released beta 1
+    #   - This version should be available on GitHub Actions.
+    # - Our required build/runtime dependencies (numpy, pytz, Cython, python-dateutil)
+    #   support that unreleased Python version.
+    # To unfreeze, comment out the ``if: false`` condition, and make sure you update
+    # the job name and the Python version in the actions/setup-python ``python-version:`` field.
+    #
+    # After it has been unfrozen, this job should remain unfrozen (present, and running) until:
+    # - The next Python version has been officially released.
+    # OR
+    # - Most/all of our optional dependencies support the next Python version AND
+    # - The next Python version has released an rc (we are guaranteed a stable ABI).
+    # To freeze this job, uncomment the ``if: false`` condition, and migrate the jobs
+    # to the corresponding posix/windows-macos/sdist etc. workflows.
+    # Feel free to modify this comment as necessary.
+ if: false # Uncomment this to freeze the workflow, comment it to unfreeze
+ runs-on: ${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [ubuntu-22.04, macOS-latest, windows-latest]
+
+ timeout-minutes: 180
+
+ concurrency:
+ #https://github.community/t/concurrecy-not-work-for-push/183068/7
+ group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-${{ matrix.pytest_target }}-dev
+ cancel-in-progress: true
+
+ env:
+ PYTEST_WORKERS: "auto"
+ PANDAS_CI: 1
+ PATTERN: "not slow and not network and not clipboard and not single_cpu"
+ COVERAGE: true
+ PYTEST_TARGET: pandas
+
+ steps:
+ - uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+
+ - name: Set up Python Dev Version
+ uses: actions/setup-python@v4
+ with:
+ python-version: '3.11-dev'
+
+ - name: Install dependencies
+ run: |
+ python --version
+ python -m pip install --upgrade pip setuptools wheel
+ python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple numpy
+ python -m pip install git+https://github.com/nedbat/coveragepy.git
+ python -m pip install versioneer[toml]
+ python -m pip install python-dateutil pytz cython hypothesis>=6.46.1 pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17
+ python -m pip list
+
+ - name: Build Pandas
+ run: |
+ python setup.py build_ext -q -j4
+ python -m pip install -e . --no-build-isolation --no-use-pep517 --no-index
+
+ - name: Build Version
+ run: |
+ python -c "import pandas; pandas.show_versions();"
+
+ - name: Test
+ uses: ./.github/actions/run-tests
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index f0d422b01e0c8..c9e44fae43669 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -23,7 +23,10 @@ on:
- cron: "27 3 */1 * *"
push:
pull_request:
- types: [labeled, opened, synchronize, reopened]
+ types: [labeled, opened, synchronize, reopened]
+ paths-ignore:
+ - "doc/**"
+ - "web/**"
workflow_dispatch:
concurrency:
@@ -71,7 +74,7 @@ jobs:
fetch-depth: 0
- name: Build wheels
- uses: pypa/cibuildwheel@v2.12.1
+ uses: pypa/cibuildwheel@v2.12.3
env:
CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }}
diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-38-arm64.yaml
index 7bc71483be34a..8f309b0781457 100644
--- a/ci/deps/circle-38-arm64.yaml
+++ b/ci/deps/circle-38-arm64.yaml
@@ -33,7 +33,8 @@ dependencies:
- jinja2
- lxml
- matplotlib>=3.6.1, <3.7.0
- - numba
+ # test_numba_vs_cython segfaults with numba 0.57
+ - numba>=0.55.2, <0.57.0
- numexpr
- openpyxl<3.1.1
- odfpy
diff --git a/ci/meta.yaml b/ci/meta.yaml
new file mode 100644
index 0000000000000..f02c7eec001fc
--- /dev/null
+++ b/ci/meta.yaml
@@ -0,0 +1,93 @@
+{% set version = "2.0.1" %}
+
+package:
+ name: pandas
+ version: {{ version }}
+
+source:
+ git_url: ../..
+
+build:
+ number: 1
+ script:
+ - export PYTHONUNBUFFERED=1 # [ppc64le]
+ - {{ PYTHON }} -m pip install -vv --no-deps --ignore-installed . # [not unix]
+ - {{ PYTHON }} -m pip install -vv --no-deps --ignore-installed . --global-option="build_ext" --global-option="-j4" --no-use-pep517 # [unix]
+ skip: true # [py<39]
+
+requirements:
+ build:
+ - python # [build_platform != target_platform]
+ - cross-python_{{ target_platform }} # [build_platform != target_platform]
+ - cython # [build_platform != target_platform]
+ - numpy # [build_platform != target_platform]
+ - {{ compiler('c') }}
+ - {{ compiler('cxx') }}
+ host:
+ - python
+ - pip
+ - setuptools >=61.0.0
+ - cython >=0.29.33,<3
+ - numpy >=1.21.6 # [py<311]
+ - numpy >=1.23.2 # [py>=311]
+ - versioneer
+ - tomli # [py<311]
+ run:
+ - python
+ - {{ pin_compatible('numpy') }}
+ - python-dateutil >=2.8.2
+ - pytz >=2020.1
+ - python-tzdata >=2022.1
+
+test:
+ imports:
+ - pandas
+ commands:
+ - pip check
+ # Skip test suite on PyPy as it segfaults there
+ # xref: https://github.com/conda-forge/pandas-feedstock/issues/148
+ #
+ # Also skip `test_rolling_var_numerical_issues` on `ppc64le` as it is a known test failure.
+ # xref: https://github.com/conda-forge/pandas-feedstock/issues/149
+ {% set markers = ["not clipboard", "not single_cpu", "not db", "not network", "not slow"] %}
+ {% set markers = markers + ["not arm_slow"] %} # [aarch64 or ppc64le]
+ {% set extra_args = ["-n=2 -m " + " and ".join(markers)] %}
+ {% set tests_to_skip = "_not_a_real_test" %}
+ {% set tests_to_skip = tests_to_skip + " or test_rolling_var_numerical_issues" %} # [ppc64le]
+ {% set tests_to_skip = tests_to_skip + " or test_std_timedelta64_skipna_false" %} # [ppc64le]
+ {% set tests_to_skip = tests_to_skip + " or test_value_counts_normalized[M8[ns]]" %} # [ppc64le]
+ {% set tests_to_skip = tests_to_skip + " or test_to_datetime_format_YYYYMMDD_with_nat" %} # [ppc64le]
+ {% set tests_to_skip = tests_to_skip + " or (TestReductions and test_median_2d)" %} # [ppc64le]
+ {% set extra_args = extra_args + ["-k", "not (" + tests_to_skip + ")"] %}
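+    # With the default marker set this renders to roughly (illustrative):
+    #   pandas.test(extra_args=["-n=2 -m not clipboard and not single_cpu and not db and not network and not slow", "-k", "not (_not_a_real_test)"])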
+ - python -c "import pandas; pandas.test(extra_args={{ extra_args }})" # [python_impl == "cpython"]
+ requires:
+ - pip
+ - pytest >=7.0.0
+ - pytest-asyncio >=0.17.0
+ - pytest-xdist >=2.2.0
+ - pytest-cov
+ - hypothesis >=6.46.1
+ - tomli # [py<311]
+
+about:
+ home: http://pandas.pydata.org
+ license: BSD-3-Clause
+ license_file: LICENSE
+ summary: Powerful data structures for data analysis, time series, and statistics
+ doc_url: https://pandas.pydata.org/docs/
+ dev_url: https://github.com/pandas-dev/pandas
+
+extra:
+ recipe-maintainers:
+ - jreback
+ - jorisvandenbossche
+ - msarahan
+ - ocefpaf
+ - TomAugspurger
+ - WillAyd
+ - simonjayhawkins
+ - mroeschke
+ - datapythonista
+ - phofl
+ - lithomas1
+ - marcogorelli
diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb
index 79596c946c068..15ea0066156b5 100644
--- a/doc/source/user_guide/style.ipynb
+++ b/doc/source/user_guide/style.ipynb
@@ -520,7 +520,7 @@
"\n",
"*New in version 1.2.0*\n",
"\n",
-    "The [.set_td_classes()][tdclass] method accepts a DataFrame with matching indices and columns to the underlying [Styler][styler]'s DataFrame. That DataFrame will contain strings as css-classes to add to individual data cells: the `<td>` elements of the `<table>`. Rather than use external CSS we will create our classes internally and add them to table style. We will save adding the borders until the [section on tooltips](#Tooltips).\n",
+    "The [.set_td_classes()][tdclass] method accepts a DataFrame with matching indices and columns to the underlying [Styler][styler]'s DataFrame. That DataFrame will contain strings as css-classes to add to individual data cells: the `<td>` elements of the `<table>`. Rather than use external CSS we will create our classes internally and add them to table style. We will save adding the borders until the [section on tooltips](#Tooltips-and-Captions).\n",
"\n",
"[tdclass]: ../reference/api/pandas.io.formats.style.Styler.set_td_classes.rst\n",
"[styler]: ../reference/api/pandas.io.formats.style.Styler.rst"
diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst
index e08fa81c5fa09..4e7733e25fa88 100644
--- a/doc/source/user_guide/window.rst
+++ b/doc/source/user_guide/window.rst
@@ -89,6 +89,7 @@ For example, a `weighted mean
diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx
--- a/pandas/_libs/tslibs/parsing.pyx
+++ b/pandas/_libs/tslibs/parsing.pyx
@@ ... @@ def guess_datetime_format(dt_str: str, dayfirst: bool | None = False) -> str | None:
output_format.append(tokens[i])
+ # if am/pm token present, replace 24-hour %H, with 12-hour %I
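+    # (e.g. "2018-01-01 11:00 PM" should then guess "%Y-%m-%d %I:%M %p"
+    # rather than pairing %H with %p; illustrative)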
+ if "%p" in output_format and "%H" in output_format:
+ i = output_format.index("%H")
+ output_format[i] = "%I"
+
guessed_format = "".join(output_format)
try:
diff --git a/pandas/core/_numba/kernels/mean_.py b/pandas/core/_numba/kernels/mean_.py
index 725989e093441..35a6e688d01d1 100644
--- a/pandas/core/_numba/kernels/mean_.py
+++ b/pandas/core/_numba/kernels/mean_.py
@@ -100,7 +100,7 @@ def sliding_mean(
neg_ct,
compensation_add,
num_consecutive_same_value,
- prev_value,
+ prev_value, # pyright: ignore[reportGeneralTypeIssues]
)
else:
for j in range(start[i - 1], s):
@@ -125,7 +125,7 @@ def sliding_mean(
neg_ct,
compensation_add,
num_consecutive_same_value,
- prev_value,
+ prev_value, # pyright: ignore[reportGeneralTypeIssues]
)
if nobs >= min_periods and nobs > 0:
diff --git a/pandas/core/_numba/kernels/sum_.py b/pandas/core/_numba/kernels/sum_.py
index 056897189fe67..eb8846b1fa50a 100644
--- a/pandas/core/_numba/kernels/sum_.py
+++ b/pandas/core/_numba/kernels/sum_.py
@@ -92,7 +92,7 @@ def sliding_sum(
sum_x,
compensation_add,
num_consecutive_same_value,
- prev_value,
+ prev_value, # pyright: ignore[reportGeneralTypeIssues]
)
else:
for j in range(start[i - 1], s):
@@ -115,7 +115,7 @@ def sliding_sum(
sum_x,
compensation_add,
num_consecutive_same_value,
- prev_value,
+ prev_value, # pyright: ignore[reportGeneralTypeIssues]
)
if nobs == 0 == min_periods:
diff --git a/pandas/core/_numba/kernels/var_.py b/pandas/core/_numba/kernels/var_.py
index d3243f4928dca..2c4559ddc2121 100644
--- a/pandas/core/_numba/kernels/var_.py
+++ b/pandas/core/_numba/kernels/var_.py
@@ -110,7 +110,7 @@ def sliding_var(
ssqdm_x,
compensation_add,
num_consecutive_same_value,
- prev_value,
+ prev_value, # pyright: ignore[reportGeneralTypeIssues]
)
else:
for j in range(start[i - 1], s):
@@ -135,7 +135,7 @@ def sliding_var(
ssqdm_x,
compensation_add,
num_consecutive_same_value,
- prev_value,
+ prev_value, # pyright: ignore[reportGeneralTypeIssues]
)
if nobs >= min_periods and nobs > ddof:
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 29009c1627cfb..d312612cdc680 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -35,6 +35,7 @@
from pandas.core.dtypes.cast import (
construct_1d_object_array_from_listlike,
infer_dtype_from_array,
+ np_find_common_type,
)
from pandas.core.dtypes.common import (
ensure_float64,
@@ -522,7 +523,7 @@ def f(c, v):
f = np.in1d
else:
- common = np.find_common_type([values.dtype, comps_array.dtype], [])
+ common = np_find_common_type(values.dtype, comps_array.dtype)
values = values.astype(common, copy=False)
comps_array = comps_array.astype(common, copy=False)
f = htable.ismember
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index dd8484050ef89..445ec36135d5f 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -53,6 +53,7 @@
is_object_dtype,
is_scalar,
)
+from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.missing import isna
from pandas.core import roperator
@@ -168,6 +169,8 @@ def to_pyarrow_type(
return dtype.pyarrow_dtype
elif isinstance(dtype, pa.DataType):
return dtype
+ elif isinstance(dtype, DatetimeTZDtype):
+ return pa.timestamp(dtype.unit, dtype.tz)
elif dtype:
try:
# Accepts python types too
@@ -247,6 +250,16 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal
Construct a new ExtensionArray from a sequence of scalars.
"""
pa_dtype = to_pyarrow_type(dtype)
+ if (
+ isinstance(scalars, np.ndarray)
+ and isinstance(dtype, ArrowDtype)
+ and (
+ pa.types.is_large_binary(pa_dtype) or pa.types.is_large_string(pa_dtype)
+ )
+ ):
+ # See https://github.com/apache/arrow/issues/35289
+ scalars = scalars.tolist()
+
if isinstance(scalars, cls):
scalars = scalars._data
elif not isinstance(scalars, (pa.Array, pa.ChunkedArray)):
@@ -259,7 +272,10 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal
# GH50430: let pyarrow infer type, then cast
scalars = pa.array(scalars, from_pandas=True)
if pa_dtype:
- scalars = scalars.cast(pa_dtype)
+ if pa.types.is_dictionary(pa_dtype):
+ scalars = scalars.dictionary_encode()
+ else:
+ scalars = scalars.cast(pa_dtype)
arr = cls(scalars)
if pa.types.is_duration(scalars.type) and scalars.null_count > 0:
# GH52843: upstream bug for duration types when originally
@@ -858,7 +874,10 @@ def factorize(
else:
data = self._data
- encoded = data.dictionary_encode(null_encoding=null_encoding)
+ if pa.types.is_dictionary(data.type):
+ encoded = data
+ else:
+ encoded = data.dictionary_encode(null_encoding=null_encoding)
if encoded.length() == 0:
indices = np.array([], dtype=np.intp)
uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type))
@@ -2151,6 +2170,11 @@ def _dt_round(
return self._round_temporally("round", freq, ambiguous, nonexistent)
def _dt_to_pydatetime(self):
+ if pa.types.is_date(self.dtype.pyarrow_dtype):
+ raise ValueError(
+ f"to_pydatetime cannot be called with {self.dtype.pyarrow_dtype} type. "
+ "Convert to pyarrow timestamp type."
+ )
data = self._data.to_pylist()
if self._dtype.pyarrow_dtype.unit == "ns":
data = [None if ts is None else ts.to_pydatetime(warn=False) for ts in data]
diff --git a/pandas/core/arrays/arrow/dtype.py b/pandas/core/arrays/arrow/dtype.py
index 20a9902f3bc90..7d4fbb788cc9c 100644
--- a/pandas/core/arrays/arrow/dtype.py
+++ b/pandas/core/arrays/arrow/dtype.py
@@ -140,6 +140,8 @@ def type(self):
elif pa.types.is_list(pa_type) or pa.types.is_large_list(pa_type):
return list
elif pa.types.is_map(pa_type):
+ return list
+ elif pa.types.is_struct(pa_type):
return dict
elif pa.types.is_null(pa_type):
# TODO: None? pd.NA? pa.null?
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index 843e9be6de14a..8545cd1499b5e 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -2010,7 +2010,7 @@ def _round(self, freq, mode, ambiguous, nonexistent):
nanos = delta_to_nanoseconds(offset, self._creso)
if nanos == 0:
# GH 52761
- return self
+ return self.copy()
result_i8 = round_nsint64(values, mode, nanos)
result = self._maybe_mask_results(result_i8, fill_value=iNaT)
result = result.view(self._ndarray.dtype)
diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py
index c7a44d3606fa6..185ed2911c11e 100644
--- a/pandas/core/arrays/sparse/dtype.py
+++ b/pandas/core/arrays/sparse/dtype.py
@@ -400,6 +400,8 @@ def _subtype_with_str(self):
def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
# TODO for now only handle SparseDtypes and numpy dtypes => extend
# with other compatible extension dtypes
+ from pandas.core.dtypes.cast import np_find_common_type
+
if any(
isinstance(x, ExtensionDtype) and not isinstance(x, SparseDtype)
for x in dtypes
@@ -420,5 +422,5 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
stacklevel=find_stack_level(),
)
- np_dtypes = [x.subtype if isinstance(x, SparseDtype) else x for x in dtypes]
- return SparseDtype(np.find_common_type(np_dtypes, []), fill_value=fill_value)
+ np_dtypes = (x.subtype if isinstance(x, SparseDtype) else x for x in dtypes)
+ return SparseDtype(np_find_common_type(*np_dtypes), fill_value=fill_value)
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 1e2192f4c7691..2dbd9465be3c6 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -1122,20 +1122,31 @@ def convert_dtypes(
from pandas.core.arrays.arrow.dtype import ArrowDtype
from pandas.core.arrays.string_ import StringDtype
- if isinstance(inferred_dtype, PandasExtensionDtype):
- base_dtype = inferred_dtype.base
- elif isinstance(inferred_dtype, (BaseMaskedDtype, ArrowDtype)):
- base_dtype = inferred_dtype.numpy_dtype
- elif isinstance(inferred_dtype, StringDtype):
- base_dtype = np.dtype(str)
- else:
- # error: Incompatible types in assignment (expression has type
- # "Union[str, Any, dtype[Any], ExtensionDtype]",
- # variable has type "Union[dtype[Any], ExtensionDtype, None]")
- base_dtype = inferred_dtype # type: ignore[assignment]
- pa_type = to_pyarrow_type(base_dtype)
- if pa_type is not None:
- inferred_dtype = ArrowDtype(pa_type)
+ assert not isinstance(inferred_dtype, str)
+
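+    # Only convert to a pyarrow-backed dtype when the inferred dtype matches one
+    # of the requested conversions, or falls outside the numeric/bool/string kinds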
+ if (
+ (convert_integer and inferred_dtype.kind in "iu")
+ or (convert_floating and inferred_dtype.kind in "fc")
+ or (convert_boolean and inferred_dtype.kind == "b")
+ or (convert_string and isinstance(inferred_dtype, StringDtype))
+ or (
+ inferred_dtype.kind not in "iufcb"
+ and not isinstance(inferred_dtype, StringDtype)
+ )
+ ):
+ if isinstance(inferred_dtype, PandasExtensionDtype) and not isinstance(
+ inferred_dtype, DatetimeTZDtype
+ ):
+ base_dtype = inferred_dtype.base
+ elif isinstance(inferred_dtype, (BaseMaskedDtype, ArrowDtype)):
+ base_dtype = inferred_dtype.numpy_dtype
+ elif isinstance(inferred_dtype, StringDtype):
+ base_dtype = np.dtype(str)
+ else:
+ base_dtype = inferred_dtype
+ pa_type = to_pyarrow_type(base_dtype)
+ if pa_type is not None:
+ inferred_dtype = ArrowDtype(pa_type)
# error: Incompatible return value type (got "Union[str, Union[dtype[Any],
# ExtensionDtype]]", expected "Union[dtype[Any], ExtensionDtype]")
@@ -1359,6 +1370,32 @@ def common_dtype_categorical_compat(
return dtype
+def np_find_common_type(*dtypes: np.dtype) -> np.dtype:
+ """
+    Implementation of np.find_common_type (deprecated in NumPy 1.25) in terms of np.result_type
+ https://github.com/pandas-dev/pandas/pull/49569#issuecomment-1308300065
+
+ Parameters
+ ----------
+ dtypes : np.dtypes
+
+ Returns
+ -------
+ np.dtype
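+
+    Examples
+    --------
+    Illustrative only; a sketch of the promotion rules implemented below:
+
+    >>> np_find_common_type(np.dtype("int64"), np.dtype("float32"))
+    dtype('float64')
+    >>> np_find_common_type(np.dtype("M8[ns]"), np.dtype("U3"))
+    dtype('O')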
+ """
+ try:
+ common_dtype = np.result_type(*dtypes)
+ if common_dtype.kind in "mMSU":
+            # NumPy promotion currently (1.25) misbehaves for times and strings,
+            # so fall back to object (np.find_common_type did, unless there
+            # was only one dtype)
+ common_dtype = np.dtype("O")
+
+ except TypeError:
+ common_dtype = np.dtype("O")
+ return common_dtype
+
+
@overload
def find_common_type(types: list[np.dtype]) -> np.dtype:
...
@@ -1426,7 +1463,7 @@ def find_common_type(types):
if is_integer_dtype(t) or is_float_dtype(t) or is_complex_dtype(t):
return np.dtype("object")
- return np.find_common_type(types, [])
+ return np_find_common_type(*types)
def construct_2d_arraylike_from_scalar(
diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
index 28bc849088d5f..709563020778e 100644
--- a/pandas/core/dtypes/concat.py
+++ b/pandas/core/dtypes/concat.py
@@ -13,6 +13,7 @@
from pandas.core.dtypes.cast import (
common_dtype_categorical_compat,
find_common_type,
+ np_find_common_type,
)
from pandas.core.dtypes.common import is_dtype_equal
from pandas.core.dtypes.dtypes import (
@@ -110,6 +111,8 @@ def is_nonempty(x) -> bool:
# coerce to object
to_concat = [x.astype("object") for x in to_concat]
kinds = {"o"}
+ else:
+ target_dtype = np_find_common_type(*dtypes)
result = np.concatenate(to_concat, axis=axis)
if "b" in kinds and result.dtype.kind in ["i", "u", "f"]:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index da61d5e88a882..7e1d8711aee86 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3816,18 +3816,8 @@ def _getitem_multilevel(self, key):
if isinstance(loc, (slice, np.ndarray)):
new_columns = self.columns[loc]
result_columns = maybe_droplevels(new_columns, key)
- if self._is_mixed_type:
- result = self.reindex(columns=new_columns)
- result.columns = result_columns
- else:
- new_values = self._values[:, loc]
- result = self._constructor(
- new_values, index=self.index, columns=result_columns, copy=False
- )
- if using_copy_on_write() and isinstance(loc, slice):
- result._mgr.add_references(self._mgr) # type: ignore[arg-type]
-
- result = result.__finalize__(self)
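+            # iloc handles copy-on-write references and __finalize__ for us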
+ result = self.iloc[:, loc]
+ result.columns = result_columns
# If there is only one column being returned, and its name is
# either an empty string, or a tuple with an empty string as its
@@ -9538,7 +9528,12 @@ def _append(
"or if the Series has a name"
)
- index = Index([other.name], name=self.index.name)
+ index = Index(
+ [other.name],
+ name=self.index.names
+ if isinstance(self.index, MultiIndex)
+ else self.index.name,
+ )
row_df = other.to_frame().T
# infer_objects is needed for
# test_append_empty_frame_to_series_with_dateutil_tz
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index dc7dff74369bb..42b7fd9b635df 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -3074,12 +3074,12 @@ def _nth(
# error: No overload variant of "where" matches argument types
# "Any", "NAType", "Any"
values = np.where(nulls, NA, grouper) # type: ignore[call-overload]
- grouper = Index(values, dtype="Int64")
+ grouper = Index(values, dtype="Int64") # type: ignore[assignment]
else:
# create a grouper with the original parameters, but on dropped
# object
- grouper, _, _ = get_grouper(
+ grouper, _, _ = get_grouper( # type: ignore[assignment]
dropped,
key=self.keys,
axis=self.axis,
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index e8969e90e6318..e2b8400188136 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -2521,7 +2521,7 @@ def is_categorical(self) -> bool:
Check if the Index holds categorical data.
.. deprecated:: 2.0.0
- Use :meth:`pandas.api.types.is_categorical_dtype` instead.
+ Use `isinstance(index.dtype, pd.CategoricalDtype)` instead.
Returns
-------
@@ -2574,7 +2574,7 @@ def is_interval(self) -> bool:
Check if the Index holds Interval objects.
.. deprecated:: 2.0.0
- Use `pandas.api.types.is_interval_dtype` instead.
+ Use `isinstance(index.dtype, pd.IntervalDtype)` instead.
Returns
-------
@@ -4877,7 +4877,7 @@ def _wrap_joined_index(
mask = lidx == -1
join_idx = self.take(lidx)
right = other.take(ridx)
- join_index = join_idx.putmask(mask, right)
+ join_index = join_idx.putmask(mask, right)._sort_levels_monotonic()
return join_index.set_names(name) # type: ignore[return-value]
else:
name = get_op_result_name(self, other)
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index afb5ab036b5b5..f8d78d21f74df 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -1768,6 +1768,13 @@ def _setitem_with_indexer(self, indexer, value, name: str = "iloc"):
if not isinstance(value, ABCSeries):
# if not Series (in which case we need to align),
# we can short-circuit
+ if (
+ isinstance(arr, np.ndarray)
+ and arr.ndim == 1
+ and len(arr) == 1
+ ):
+ # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
+ arr = arr[0, ...]
empty_value[indexer[0]] = arr
self.obj[key] = empty_value
return
diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py
index 998f3bc374942..78e530f915117 100644
--- a/pandas/core/interchange/from_dataframe.py
+++ b/pandas/core/interchange/from_dataframe.py
@@ -6,6 +6,8 @@
import numpy as np
+from pandas.compat._optional import import_optional_dependency
+
import pandas as pd
from pandas.core.interchange.dataframe_protocol import (
Buffer,
@@ -23,7 +25,7 @@
DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64},
DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64},
DtypeKind.FLOAT: {32: np.float32, 64: np.float64},
- DtypeKind.BOOL: {8: bool},
+ DtypeKind.BOOL: {1: bool, 8: bool},
}
@@ -154,7 +156,9 @@ def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
buffers = col.get_buffers()
data_buff, data_dtype = buffers["data"]
- data = buffer_to_ndarray(data_buff, data_dtype, col.offset, col.size())
+ data = buffer_to_ndarray(
+ data_buff, data_dtype, offset=col.offset, length=col.size()
+ )
data = set_nulls(data, col, buffers["validity"])
return data, buffers
@@ -192,11 +196,16 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]:
buffers = col.get_buffers()
codes_buff, codes_dtype = buffers["data"]
- codes = buffer_to_ndarray(codes_buff, codes_dtype, col.offset, col.size())
+ codes = buffer_to_ndarray(
+ codes_buff, codes_dtype, offset=col.offset, length=col.size()
+ )
    # Doing modulo in order to not get ``IndexError`` for
# out-of-bounds sentinel values in `codes`
- values = categories[codes % len(categories)]
+ if len(categories) > 0:
+ values = categories[codes % len(categories)]
+ else:
+ values = codes
cat = pd.Categorical(
values, categories=categories, ordered=categorical["is_ordered"]
@@ -252,7 +261,7 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
Endianness.NATIVE,
)
# Specify zero offset as we don't want to chunk the string data
- data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=col.size())
+ data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize)
# Retrieve the offsets buffer containing the index offsets demarcating
# the beginning and the ending of each string
@@ -261,14 +270,16 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
# meaning that it has more elements than in the data buffer, do `col.size() + 1`
# here to pass a proper offsets buffer size
offsets = buffer_to_ndarray(
- offset_buff, offset_dtype, col.offset, length=col.size() + 1
+ offset_buff, offset_dtype, offset=col.offset, length=col.size() + 1
)
null_pos = None
if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
assert buffers["validity"], "Validity buffers cannot be empty for masks"
valid_buff, valid_dtype = buffers["validity"]
- null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size())
+ null_pos = buffer_to_ndarray(
+ valid_buff, valid_dtype, offset=col.offset, length=col.size()
+ )
if sentinel_val == 0:
null_pos = ~null_pos
@@ -356,8 +367,8 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
getattr(ArrowCTypes, f"UINT{dtype[1]}"),
Endianness.NATIVE,
),
- col.offset,
- col.size(),
+ offset=col.offset,
+ length=col.size(),
)
data = parse_datetime_format_str(format_str, data)
@@ -368,8 +379,9 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
def buffer_to_ndarray(
buffer: Buffer,
dtype: tuple[DtypeKind, int, str, str],
+ *,
+ length: int,
offset: int = 0,
- length: int | None = None,
) -> np.ndarray:
"""
Build a NumPy array from the passed buffer.
@@ -406,74 +418,27 @@ def buffer_to_ndarray(
# and size in the buffer plus the dtype on the column. Use DLPack as NumPy supports
# it since https://github.com/numpy/numpy/pull/19083
ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype)
- data_pointer = ctypes.cast(
- buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
- )
if bit_width == 1:
assert length is not None, "`length` must be specified for a bit-mask buffer."
- arr = np.ctypeslib.as_array(data_pointer, shape=(buffer.bufsize,))
- return bitmask_to_bool_ndarray(arr, length, first_byte_offset=offset % 8)
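+        # let pyarrow unpack the bit-packed validity buffer; this replaces the
+        # hand-rolled bitmask_to_bool_ndarray helper removed below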
+ pa = import_optional_dependency("pyarrow")
+ arr = pa.BooleanArray.from_buffers(
+ pa.bool_(),
+ length,
+ [None, pa.foreign_buffer(buffer.ptr, length)],
+ offset=offset,
+ )
+ return np.asarray(arr)
else:
+ data_pointer = ctypes.cast(
+ buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
+ )
return np.ctypeslib.as_array(
- data_pointer, shape=(buffer.bufsize // (bit_width // 8),)
+ data_pointer,
+ shape=(length,),
)
-def bitmask_to_bool_ndarray(
- bitmask: np.ndarray, mask_length: int, first_byte_offset: int = 0
-) -> np.ndarray:
- """
- Convert bit-mask to a boolean NumPy array.
-
- Parameters
- ----------
- bitmask : np.ndarray[uint8]
- NumPy array of uint8 dtype representing the bitmask.
- mask_length : int
- Number of elements in the mask to interpret.
- first_byte_offset : int, default: 0
- Number of elements to offset from the start of the first byte.
-
- Returns
- -------
- np.ndarray[bool]
- """
- bytes_to_skip = first_byte_offset // 8
- bitmask = bitmask[bytes_to_skip:]
- first_byte_offset %= 8
-
- bool_mask = np.zeros(mask_length, dtype=bool)
-
- # Processing the first byte separately as it has its own offset
- val = bitmask[0]
- mask_idx = 0
- bits_in_first_byte = min(8 - first_byte_offset, mask_length)
- for j in range(bits_in_first_byte):
- if val & (1 << (j + first_byte_offset)):
- bool_mask[mask_idx] = True
- mask_idx += 1
-
- # `mask_length // 8` describes how many full bytes to process
- for i in range((mask_length - bits_in_first_byte) // 8):
- # doing `+ 1` as we already processed the first byte
- val = bitmask[i + 1]
- for j in range(8):
- if val & (1 << j):
- bool_mask[mask_idx] = True
- mask_idx += 1
-
- if len(bitmask) > 1:
- # Processing reminder of last byte
- val = bitmask[-1]
- for j in range(len(bool_mask) - mask_idx):
- if val & (1 << j):
- bool_mask[mask_idx] = True
- mask_idx += 1
-
- return bool_mask
-
-
def set_nulls(
data: np.ndarray | pd.Series,
col: Column,
@@ -509,7 +474,9 @@ def set_nulls(
elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
assert validity, "Expected to have a validity buffer for the mask"
valid_buff, valid_dtype = validity
- null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size())
+ null_pos = buffer_to_ndarray(
+ valid_buff, valid_dtype, offset=col.offset, length=col.size()
+ )
if sentinel_val == 0:
null_pos = ~null_pos
elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN):
diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py
index bb5d7e839a98c..bf48e1ff0a653 100644
--- a/pandas/core/internals/base.py
+++ b/pandas/core/internals/base.py
@@ -186,6 +186,10 @@ def setitem_inplace(self, indexer, value) -> None:
# dt64/td64, which do their own validation.
value = np_can_hold_element(arr.dtype, value)
+ if isinstance(value, np.ndarray) and value.ndim == 1 and len(value) == 1:
+ # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
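+            # unwrap the length-1 array to a 0-d array so the assignment avoids
+            # NumPy's size-1-array-to-scalar deprecation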
+ value = value[0, ...]
+
arr[indexer] = value
def grouped_reduce(self, func):
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index cb336d2f718a6..b2a6b1fa39219 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -1061,7 +1061,9 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block:
self = self.make_block_same_class(
values.T if values.ndim == 2 else values
)
-
+ if isinstance(casted, np.ndarray) and casted.ndim == 1 and len(casted) == 1:
+ # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
+ casted = casted[0, ...]
values[indexer] = casted
return self
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py
index a33ce8fd5c459..e9cf7a151b627 100644
--- a/pandas/core/internals/concat.py
+++ b/pandas/core/internals/concat.py
@@ -28,6 +28,7 @@
from pandas.core.dtypes.cast import (
ensure_dtype_can_hold_na,
find_common_type,
+ np_find_common_type,
)
from pandas.core.dtypes.common import (
is_1d_only_ea_dtype,
@@ -144,7 +145,7 @@ def concat_arrays(to_concat: list) -> ArrayLike:
target_dtype = to_concat_no_proxy[0].dtype
elif all(x.kind in ["i", "u", "b"] and isinstance(x, np.dtype) for x in dtypes):
# GH#42092
- target_dtype = np.find_common_type(list(dtypes), [])
+ target_dtype = np_find_common_type(*dtypes)
else:
target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy])
diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py
index ef2cf8e96782d..ccd9ccfff808b 100644
--- a/pandas/core/methods/describe.py
+++ b/pandas/core/methods/describe.py
@@ -31,11 +31,10 @@
from pandas.core.dtypes.common import (
is_bool_dtype,
is_complex_dtype,
- is_datetime64_any_dtype,
is_extension_array_dtype,
is_numeric_dtype,
- is_timedelta64_dtype,
)
+from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.arrays.arrow.dtype import ArrowDtype
from pandas.core.arrays.floating import Float64Dtype
@@ -232,9 +231,13 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
dtype: DtypeObj | None
if is_extension_array_dtype(series):
if isinstance(series.dtype, ArrowDtype):
- import pyarrow as pa
+ if series.dtype.kind == "m":
+ # GH53001: describe timedeltas with object dtype
+ dtype = None
+ else:
+ import pyarrow as pa
- dtype = ArrowDtype(pa.float64())
+ dtype = ArrowDtype(pa.float64())
else:
dtype = Float64Dtype()
elif is_numeric_dtype(series) and not is_complex_dtype(series):
@@ -362,9 +365,9 @@ def select_describe_func(
return describe_categorical_1d
elif is_numeric_dtype(data):
return describe_numeric_1d
- elif is_datetime64_any_dtype(data.dtype):
+ elif data.dtype.kind == "M" or isinstance(data.dtype, DatetimeTZDtype):
return describe_timestamp_1d
- elif is_timedelta64_dtype(data.dtype):
+ elif data.dtype.kind == "m":
return describe_numeric_1d
else:
return describe_categorical_1d
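
An assumed usage example of the describe() branch added above (GH53001): ArrowDtype duration data is no longer cast to float64, so the summary keeps Timedelta values in an object-dtype Series:

import pandas as pd
import pyarrow as pa

ser = pd.Series(range(1, 10), dtype=pd.ArrowDtype(pa.duration("s")))
# count stays numeric; mean/std/min/quantiles/max come back as Timedelta
print(ser.describe())
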
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index bc8f4b97d539a..79f130451a986 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -446,7 +446,7 @@ def __init__(
keys = type(keys).from_tuples(clean_keys, names=keys.names)
else:
name = getattr(keys, "name", None)
- keys = Index(clean_keys, name=name)
+ keys = Index(clean_keys, name=name, dtype=getattr(keys, "dtype", None))
if len(objs) == 0:
raise ValueError("All objects passed were None")
@@ -743,15 +743,19 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde
for hlevel, level in zip(zipped, levels):
to_concat = []
- for key, index in zip(hlevel, indexes):
- # Find matching codes, include matching nan values as equal.
- mask = (isna(level) & isna(key)) | (level == key)
- if not mask.any():
- raise ValueError(f"Key {key} not in level {level}")
- i = np.nonzero(mask)[0][0]
-
- to_concat.append(np.repeat(i, len(index)))
- codes_list.append(np.concatenate(to_concat))
+ if isinstance(hlevel, Index) and hlevel.equals(level):
+ lens = [len(idx) for idx in indexes]
+ codes_list.append(np.repeat(np.arange(len(hlevel)), lens))
+ else:
+ for key, index in zip(hlevel, indexes):
+ # Find matching codes, include matching nan values as equal.
+ mask = (isna(level) & isna(key)) | (level == key)
+ if not mask.any():
+ raise ValueError(f"Key {key} not in level {level}")
+ i = np.nonzero(mask)[0][0]
+
+ to_concat.append(np.repeat(i, len(index)))
+ codes_list.append(np.concatenate(to_concat))
concat_index = _concat_indexes(indexes)
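
The fast path added to `_make_concat_multiindex` skips the per-key scan when the keys already equal the level; the codes are then just 0..n-1 repeated by each index's length. A toy illustration:

import numpy as np

lens = [2, 3, 1]  # lengths of the three concatenated indexes
codes = np.repeat(np.arange(len(lens)), lens)
print(codes)  # [0 0 1 1 1 2]
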
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index bb01d551628d3..4c9cfb476586e 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -123,6 +123,10 @@
np.object_: libhashtable.ObjectFactorizer,
}
+# See https://github.com/pandas-dev/pandas/issues/52451
+if np.intc is not np.int32:
+ _factorizers[np.intc] = libhashtable.Int64Factorizer
+
@Substitution("\nleft : DataFrame or named Series")
@Appender(_merge_doc, indents=0)
@@ -1000,6 +1004,14 @@ def _maybe_add_join_keys(
else:
key_col = Index(lvals).where(~mask_left, rvals)
result_dtype = find_common_type([lvals.dtype, rvals.dtype])
+ if (
+ lvals.dtype.kind == "M"
+ and rvals.dtype.kind == "M"
+ and result_dtype.kind == "O"
+ ):
+ # TODO(non-nano) Workaround for common_type not dealing
+ # with different resolutions
+ result_dtype = key_col.dtype
if result._is_label_reference(name):
result[name] = Series(
@@ -1401,6 +1413,12 @@ def _maybe_coerce_merge_keys(self) -> None:
rk.dtype, DatetimeTZDtype
):
raise ValueError(msg)
+ elif (
+ isinstance(lk.dtype, DatetimeTZDtype)
+ and isinstance(rk.dtype, DatetimeTZDtype)
+ ) or (lk.dtype.kind == "M" and rk.dtype.kind == "M"):
+ # allows datetime with different resolutions
+ continue
elif lk_is_object and rk_is_object:
continue
@@ -2355,7 +2373,7 @@ def _factorize_keys(
if isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype):
# Extract the ndarray (UTC-localized) values
# Note: we dont need the dtypes to match, as these can still be compared
- # TODO(non-nano): need to make sure resolutions match
+ lk, rk = cast("DatetimeArray", lk)._ensure_matching_resos(rk)
lk = cast("DatetimeArray", lk)._ndarray
rk = cast("DatetimeArray", rk)._ndarray
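
A hedged sketch of why the `np.intc` entry is needed (GH#52451; names illustrative): on some builds, notably Windows, `np.intc` and `np.int32` describe the same 32-bit C int yet are distinct type objects, so a factorizer table keyed on `np.int32` misses `np.intc`:

import numpy as np

print(np.dtype(np.intc) == np.dtype(np.int32))  # True: same width and kind
print(np.intc is np.int32)                      # can be False: distinct classes

factorizers = {np.int32: "Int32Factorizer"}     # stand-in for the real table
if np.intc is not np.int32:
    factorizers[np.intc] = "Int64Factorizer"    # mirror the diff's fallback
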
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 29e02fdc7695d..78f4da4e65196 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1940,7 +1940,9 @@ def to_frame(self, name: Hashable = lib.no_default) -> DataFrame:
df = self._constructor_expanddim(mgr)
return df.__finalize__(self, method="to_frame")
- def _set_name(self, name, inplace: bool = False) -> Series:
+ def _set_name(
+ self, name, inplace: bool = False, deep: bool | None = None
+ ) -> Series:
"""
Set the Series name.
@@ -1949,9 +1951,11 @@ def _set_name(self, name, inplace: bool = False) -> Series:
name : str
inplace : bool
Whether to modify `self` directly or return a copy.
+        deep : bool or None, default None
+            Whether to do a deep copy (True), a shallow copy (False), or rely on Copy-on-Write (None).
"""
inplace = validate_bool_kwarg(inplace, "inplace")
- ser = self if inplace else self.copy()
+ ser = self if inplace else self.copy(deep and not using_copy_on_write())
ser.name = name
return ser
@@ -4770,7 +4774,7 @@ def rename(
index: Renamer | Hashable | None = None,
*,
axis: Axis | None = None,
- copy: bool = True,
+ copy: bool | None = None,
inplace: bool = False,
level: Level | None = None,
errors: IgnoreRaise = "ignore",
@@ -4857,7 +4861,7 @@ def rename(
errors=errors,
)
else:
- return self._set_name(index, inplace=inplace)
+ return self._set_name(index, inplace=inplace, deep=copy)
@Appender(
"""
diff --git a/pandas/io/formats/string.py b/pandas/io/formats/string.py
index 071afc059b166..c143988bdc885 100644
--- a/pandas/io/formats/string.py
+++ b/pandas/io/formats/string.py
@@ -135,12 +135,6 @@ def _join_multiline(self, strcols_input: Iterable[list[str]]) -> str:
col_bins = _binify(col_widths, lwidth)
nbins = len(col_bins)
- if self.fmt.is_truncated_vertically:
- assert self.fmt.max_rows_fitted is not None
- nrows = self.fmt.max_rows_fitted + 1
- else:
- nrows = len(self.frame)
-
str_lst = []
start = 0
for i, end in enumerate(col_bins):
@@ -148,6 +142,7 @@ def _join_multiline(self, strcols_input: Iterable[list[str]]) -> str:
if self.fmt.index:
row.insert(0, idx)
if nbins > 1:
+ nrows = len(row[-1])
if end <= len(strcols) and i < nbins - 1:
row.append([" \\"] + [" "] * (nrows - 1))
else:
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 3acb1073bac93..2db759719fcb4 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -1113,7 +1113,16 @@ def _make_date_converter(
if date_parser is not lib.no_default and date_format is not None:
raise TypeError("Cannot use both 'date_parser' and 'date_format'")
+ def unpack_if_single_element(arg):
+ # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
+ if isinstance(arg, np.ndarray) and arg.ndim == 1 and len(arg) == 1:
+ return arg[0]
+ return arg
+
def converter(*date_cols, col: Hashable):
+ if len(date_cols) == 1 and date_cols[0].dtype.kind in "Mm":
+ return date_cols[0]
+
if date_parser is lib.no_default:
strs = parsing.concat_date_cols(date_cols)
date_fmt = (
@@ -1136,7 +1145,9 @@ def converter(*date_cols, col: Hashable):
else:
try:
result = tools.to_datetime(
- date_parser(*date_cols), errors="ignore", cache=cache_dates
+ date_parser(*(unpack_if_single_element(arg) for arg in date_cols)),
+ errors="ignore",
+ cache=cache_dates,
)
if isinstance(result, datetime.datetime):
raise Exception("scalar parser")
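
A small check of the early return added to `converter()`: with the pyarrow engine a column can already arrive as datetime64/timedelta64, in which case re-parsing is skipped (illustrative, outside the parser):

import numpy as np

col = np.array(["2000-01-01"], dtype="datetime64[ns]")
print(col.dtype.kind in "Mm")  # True -> the converter returns it unchanged
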
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index c2393fc7ada06..a627a60ef0691 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -158,7 +158,9 @@ def _convert_arrays_to_dataframe(
ArrowExtensionArray(pa.array(arr, from_pandas=True)) for arr in arrays
]
if arrays:
- return DataFrame(dict(zip(columns, arrays)))
+ df = DataFrame(dict(zip(list(range(len(columns))), arrays)))
+ df.columns = columns
+ return df
else:
return DataFrame(columns=columns)
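
A sketch of why `_convert_arrays_to_dataframe` now builds with positional keys first (GH#53117): a dict collapses duplicate labels, so `dict(zip(columns, arrays))` silently drops data when a query yields repeated column names:

import pandas as pd

columns = ["a", "b", "a"]
arrays = [[1, 2], [3, 4], [5, 6]]

df = pd.DataFrame(dict(zip(range(len(columns)), arrays)))  # unique int keys
df.columns = columns                                       # restore duplicates
print(df.columns.tolist())  # ['a', 'b', 'a']
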
diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py
index 1d8f1dea7d478..c220c46bdc8f8 100644
--- a/pandas/tests/copy_view/test_methods.py
+++ b/pandas/tests/copy_view/test_methods.py
@@ -135,6 +135,7 @@ def test_methods_copy_keyword(
"method",
[
lambda ser, copy: ser.rename(index={0: 100}, copy=copy),
+ lambda ser, copy: ser.rename(None, copy=copy),
lambda ser, copy: ser.reindex(index=ser.index, copy=copy),
lambda ser, copy: ser.reindex_like(ser, copy=copy),
lambda ser, copy: ser.align(ser, copy=copy)[0],
@@ -152,6 +153,7 @@ def test_methods_copy_keyword(
lambda ser, copy: ser.set_flags(allows_duplicate_labels=False, copy=copy),
],
ids=[
+ "rename (dict)",
"rename",
"reindex",
"reindex_like",
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index 650eb033dcd9e..a65fa240bf7f3 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -1012,9 +1012,7 @@ def test_maybe_convert_objects_itemsize(self, data0, data1):
data = [data0, data1]
arr = np.array(data, dtype="object")
- common_kind = np.find_common_type(
- [type(data0), type(data1)], scalar_types=[]
- ).kind
+ common_kind = np.result_type(type(data0), type(data1)).kind
kind0 = "python" if not hasattr(data0, "dtype") else data0.dtype.kind
kind1 = "python" if not hasattr(data1, "dtype") else data1.dtype.kind
if kind0 != "python" and kind1 != "python":
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index d557f5efc0a8a..8907137c71844 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -131,7 +131,7 @@ def data(dtype):
@pytest.fixture
def data_missing(data):
"""Length-2 array with [NA, Valid]"""
- return type(data)._from_sequence([None, data[0]])
+ return type(data)._from_sequence([None, data[0]], dtype=data.dtype)
@pytest.fixture(params=["data", "data_missing"])
@@ -214,7 +214,8 @@ def data_for_sorting(data_for_grouping):
A < B < C
"""
return type(data_for_grouping)._from_sequence(
- [data_for_grouping[0], data_for_grouping[7], data_for_grouping[4]]
+ [data_for_grouping[0], data_for_grouping[7], data_for_grouping[4]],
+ dtype=data_for_grouping.dtype,
)
@@ -227,7 +228,8 @@ def data_missing_for_sorting(data_for_grouping):
A < B and NA missing.
"""
return type(data_for_grouping)._from_sequence(
- [data_for_grouping[0], data_for_grouping[2], data_for_grouping[4]]
+ [data_for_grouping[0], data_for_grouping[2], data_for_grouping[4]],
+ dtype=data_for_grouping.dtype,
)
@@ -1578,7 +1580,8 @@ def test_mode_dropna_false_mode_na(data):
[pa.large_string(), str],
[pa.list_(pa.int64()), list],
[pa.large_list(pa.int64()), list],
- [pa.map_(pa.string(), pa.int64()), dict],
+ [pa.map_(pa.string(), pa.int64()), list],
+ [pa.struct([("f1", pa.int8()), ("f2", pa.string())]), dict],
[pa.dictionary(pa.int64(), pa.int64()), CategoricalDtypeType],
],
)
@@ -1829,6 +1832,20 @@ def test_searchsorted_with_na_raises(data_for_sorting, as_series):
arr.searchsorted(b)
+def test_sort_values_dictionary():
+ df = pd.DataFrame(
+ {
+ "a": pd.Series(
+ ["x", "y"], dtype=ArrowDtype(pa.dictionary(pa.int32(), pa.string()))
+ ),
+ "b": [1, 2],
+ },
+ )
+ expected = df.copy()
+ result = df.sort_values(by=["a", "b"])
+ tm.assert_frame_equal(result, expected)
+
+
@pytest.mark.parametrize("pat", ["abc", "a[a-z]{2}"])
def test_str_count(pat):
ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
@@ -2510,6 +2527,17 @@ def test_dt_to_pydatetime():
tm.assert_numpy_array_equal(result, expected)
+@pytest.mark.parametrize("date_type", [32, 64])
+def test_dt_to_pydatetime_date_error(date_type):
+ # GH 52812
+ ser = pd.Series(
+ [date(2022, 12, 31)],
+ dtype=ArrowDtype(getattr(pa, f"date{date_type}")()),
+ )
+ with pytest.raises(ValueError, match="to_pydatetime cannot be called with"):
+ ser.dt.to_pydatetime()
+
+
def test_dt_tz_localize_unsupported_tz_options():
ser = pd.Series(
[datetime(year=2023, month=1, day=2, hour=3), None],
@@ -2618,6 +2646,20 @@ def test_setitem_boolean_replace_with_mask_segfault():
assert arr._data == expected._data
+@pytest.mark.parametrize(
+ "data, arrow_dtype",
+ [
+ ([b"a", b"b"], pa.large_binary()),
+ (["a", "b"], pa.large_string()),
+ ],
+)
+def test_conversion_large_dtypes_from_numpy_array(data, arrow_dtype):
+ dtype = ArrowDtype(arrow_dtype)
+ result = pd.array(np.array(data), dtype=dtype)
+ expected = pd.array(data, dtype=dtype)
+ tm.assert_extension_array_equal(result, expected)
+
+
@pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES + tm.FLOAT_PYARROW_DTYPES)
def test_describe_numeric_data(pa_type):
# GH 52470
@@ -2631,6 +2673,36 @@ def test_describe_numeric_data(pa_type):
tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize("pa_type", tm.TIMEDELTA_PYARROW_DTYPES)
+def test_describe_timedelta_data(pa_type):
+ # GH53001
+ data = pd.Series(range(1, 10), dtype=ArrowDtype(pa_type))
+ result = data.describe()
+ expected = pd.Series(
+ [9] + pd.to_timedelta([5, 2, 1, 3, 5, 7, 9], unit=pa_type.unit).tolist(),
+ dtype=object,
+ index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
+ )
+ tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("pa_type", tm.DATETIME_PYARROW_DTYPES)
+def test_describe_datetime_data(pa_type):
+ # GH53001
+ data = pd.Series(range(1, 10), dtype=ArrowDtype(pa_type))
+ result = data.describe()
+ expected = pd.Series(
+ [9]
+ + [
+ pd.Timestamp(v, tz=pa_type.tz, unit=pa_type.unit)
+ for v in [5, 1, 3, 5, 7, 9]
+ ],
+ dtype=object,
+ index=["count", "mean", "min", "25%", "50%", "75%", "max"],
+ )
+ tm.assert_series_equal(result, expected)
+
+
@pytest.mark.xfail(
pa_version_under8p0,
reason="Function 'add_checked' has no kernel matching input types",
diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py
index 6076933eecec4..2adee158379bb 100644
--- a/pandas/tests/frame/methods/test_convert_dtypes.py
+++ b/pandas/tests/frame/methods/test_convert_dtypes.py
@@ -53,7 +53,8 @@ def test_pyarrow_dtype_backend(self):
"c": pd.Series([True, False, None], dtype=np.dtype("O")),
"d": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
"e": pd.Series(pd.date_range("2022", periods=3)),
- "f": pd.Series(pd.timedelta_range("1D", periods=3)),
+ "f": pd.Series(pd.date_range("2022", periods=3, tz="UTC").as_unit("s")),
+ "g": pd.Series(pd.timedelta_range("1D", periods=3)),
}
)
result = df.convert_dtypes(dtype_backend="pyarrow")
@@ -76,6 +77,16 @@ def test_pyarrow_dtype_backend(self):
)
),
"f": pd.arrays.ArrowExtensionArray(
+ pa.array(
+ [
+ datetime.datetime(2022, 1, 1),
+ datetime.datetime(2022, 1, 2),
+ datetime.datetime(2022, 1, 3),
+ ],
+ type=pa.timestamp(unit="s", tz="UTC"),
+ )
+ ),
+ "g": pd.arrays.ArrowExtensionArray(
pa.array(
[
datetime.timedelta(1),
@@ -134,3 +145,17 @@ def test_pyarrow_engine_lines_false(self):
)
with pytest.raises(ValueError, match=msg):
df.convert_dtypes(dtype_backend="numpy")
+
+    def test_pyarrow_backend_no_conversion(self):
+ # GH#52872
+ pytest.importorskip("pyarrow")
+ df = pd.DataFrame({"a": [1, 2], "b": 1.5, "c": True, "d": "x"})
+ expected = df.copy()
+ result = df.convert_dtypes(
+ convert_floating=False,
+ convert_integer=False,
+ convert_boolean=False,
+ convert_string=False,
+ dtype_backend="pyarrow",
+ )
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index 31a8e7a7d36ac..1ef5bc9925f95 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -620,7 +620,7 @@ def test_categorical_transformers(
result = getattr(gb_keepna, transformation_func)(*args)
expected = getattr(gb_dropna, transformation_func)(*args)
for iloc, value in zip(
- df[df["x"].isnull()].index.tolist(), null_group_result.values
+ df[df["x"].isnull()].index.tolist(), null_group_result.values.ravel()
):
if expected.ndim == 1:
expected.iloc[iloc] = value
diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py
index 4fff862961920..c5a3512113655 100644
--- a/pandas/tests/indexes/multi/test_join.py
+++ b/pandas/tests/indexes/multi/test_join.py
@@ -257,3 +257,15 @@ def test_join_dtypes_all_nan(any_numeric_ea_dtype):
]
)
tm.assert_index_equal(result, expected)
+
+
+def test_join_index_levels():
+ # GH#53093
+    midx = MultiIndex.from_tuples([("a", "2019-02-01"), ("a", "2019-02-01")])
+ midx2 = MultiIndex.from_tuples([("a", "2019-01-31")])
+ result = midx.join(midx2, how="outer")
+ expected = MultiIndex.from_tuples(
+ [("a", "2019-01-31"), ("a", "2019-02-01"), ("a", "2019-02-01")]
+ )
+ tm.assert_index_equal(result.levels[1], expected.levels[1])
+ tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py
index 8e507212976ec..22a6f62f53392 100644
--- a/pandas/tests/indexing/multiindex/test_multiindex.py
+++ b/pandas/tests/indexing/multiindex/test_multiindex.py
@@ -6,12 +6,14 @@
import pandas as pd
from pandas import (
+ CategoricalDtype,
DataFrame,
Index,
MultiIndex,
Series,
)
import pandas._testing as tm
+from pandas.core.arrays.boolean import BooleanDtype
class TestMultiIndexBasic:
@@ -206,3 +208,21 @@ def test_multiindex_with_na_missing_key(self):
)
with pytest.raises(KeyError, match="missing_key"):
df[[("missing_key",)]]
+
+ def test_multiindex_dtype_preservation(self):
+ # GH51261
+ columns = MultiIndex.from_tuples([("A", "B")], names=["lvl1", "lvl2"])
+ df = DataFrame(["value"], columns=columns).astype("category")
+ df_no_multiindex = df["A"]
+ assert isinstance(df_no_multiindex["B"].dtype, CategoricalDtype)
+
+ # geopandas 1763 analogue
+ df = DataFrame(
+ [[1, 0], [0, 1]],
+ columns=[
+ ["foo", "foo"],
+ ["location", "location"],
+ ["x", "y"],
+ ],
+ ).assign(bools=Series([True, False], dtype="boolean"))
+ assert isinstance(df["bools"].dtype, BooleanDtype)
diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py
index 27b4a4be73032..e6f44359a1a62 100644
--- a/pandas/tests/indexing/multiindex/test_setitem.py
+++ b/pandas/tests/indexing/multiindex/test_setitem.py
@@ -479,6 +479,21 @@ def test_setitem_new_column_all_na(self):
df["new"] = s
assert df["new"].isna().all()
+ def test_setitem_enlargement_keep_index_names(self):
+ # GH#53053
+ mi = MultiIndex.from_tuples([(1, 2, 3)], names=["i1", "i2", "i3"])
+ df = DataFrame(data=[[10, 20, 30]], index=mi, columns=["A", "B", "C"])
+ df.loc[(0, 0, 0)] = df.loc[(1, 2, 3)]
+ mi_expected = MultiIndex.from_tuples(
+ [(1, 2, 3), (0, 0, 0)], names=["i1", "i2", "i3"]
+ )
+ expected = DataFrame(
+ data=[[10, 20, 30], [10, 20, 30]],
+ index=mi_expected,
+ columns=["A", "B", "C"],
+ )
+ tm.assert_frame_equal(df, expected)
+
@td.skip_array_manager_invalid_test # df["foo"] select multiple columns -> .values
# is not a view
diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py
index a9835b8641e7d..301cccec9e0ed 100644
--- a/pandas/tests/interchange/test_impl.py
+++ b/pandas/tests/interchange/test_impl.py
@@ -89,6 +89,18 @@ def test_categorical_pyarrow():
tm.assert_frame_equal(result, expected)
+def test_empty_categorical_pyarrow():
+ # https://github.com/pandas-dev/pandas/issues/53077
+ pa = pytest.importorskip("pyarrow", "11.0.0")
+
+ arr = [None]
+ table = pa.table({"arr": pa.array(arr, "float64").dictionary_encode()})
+ exchange_df = table.__dataframe__()
+ result = pd.api.interchange.from_dataframe(exchange_df)
+ expected = pd.DataFrame({"arr": pd.Categorical([np.nan])})
+ tm.assert_frame_equal(result, expected)
+
+
def test_large_string_pyarrow():
# GH 52795
pa = pytest.importorskip("pyarrow", "11.0.0")
@@ -104,6 +116,32 @@ def test_large_string_pyarrow():
assert pa.Table.equals(pa.interchange.from_dataframe(result), table)
+@pytest.mark.parametrize(
+ ("offset", "length", "expected_values"),
+ [
+ (0, None, [3.3, float("nan"), 2.1]),
+ (1, None, [float("nan"), 2.1]),
+ (2, None, [2.1]),
+ (0, 2, [3.3, float("nan")]),
+ (0, 1, [3.3]),
+ (1, 1, [float("nan")]),
+ ],
+)
+def test_bitmasks_pyarrow(offset, length, expected_values):
+ # GH 52795
+ pa = pytest.importorskip("pyarrow", "11.0.0")
+
+ arr = [3.3, None, 2.1]
+ table = pa.table({"arr": arr}).slice(offset, length)
+ exchange_df = table.__dataframe__()
+ result = from_dataframe(exchange_df)
+ expected = pd.DataFrame({"arr": expected_values})
+ tm.assert_frame_equal(result, expected)
+
+ # check round-trip
+ assert pa.Table.equals(pa.interchange.from_dataframe(result), table)
+
+
@pytest.mark.parametrize(
"data", [int_data, uint_data, float_data, bool_data, datetime_data]
)
diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py
index 0d84ecf955700..fcb7f6657beac 100644
--- a/pandas/tests/io/formats/test_format.py
+++ b/pandas/tests/io/formats/test_format.py
@@ -1397,25 +1397,100 @@ def test_to_string_no_index(self):
assert df_s == expected
def test_to_string_line_width_no_index(self):
- # GH 13998, GH 22505, # GH 49230
+ # GH 13998, GH 22505
df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
df_s = df.to_string(line_width=1, index=False)
- expected = " x \n 1 \\\n 2 \n 3 \n\n y \n 4 \n 5 \n 6 "
+ expected = " x \\\n 1 \n 2 \n 3 \n\n y \n 4 \n 5 \n 6 "
assert df_s == expected
df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]})
df_s = df.to_string(line_width=1, index=False)
- expected = " x \n11 \\\n22 \n33 \n\n y \n 4 \n 5 \n 6 "
+ expected = " x \\\n11 \n22 \n33 \n\n y \n 4 \n 5 \n 6 "
assert df_s == expected
df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]})
df_s = df.to_string(line_width=1, index=False)
- expected = " x \n 11 \\\n 22 \n-33 \n\n y \n 4 \n 5 \n-6 "
+ expected = " x \\\n 11 \n 22 \n-33 \n\n y \n 4 \n 5 \n-6 "
+
+ assert df_s == expected
+
+ def test_to_string_line_width_no_header(self):
+ # GH 53054
+ df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
+
+ df_s = df.to_string(line_width=1, header=False)
+ expected = "0 1 \\\n1 2 \n2 3 \n\n0 4 \n1 5 \n2 6 "
+
+ assert df_s == expected
+
+ df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]})
+
+ df_s = df.to_string(line_width=1, header=False)
+ expected = "0 11 \\\n1 22 \n2 33 \n\n0 4 \n1 5 \n2 6 "
+
+ assert df_s == expected
+
+ df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]})
+
+ df_s = df.to_string(line_width=1, header=False)
+ expected = "0 11 \\\n1 22 \n2 -33 \n\n0 4 \n1 5 \n2 -6 "
+
+ assert df_s == expected
+
+ def test_to_string_line_width_no_index_no_header(self):
+ # GH 53054
+ df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
+
+ df_s = df.to_string(line_width=1, index=False, header=False)
+ expected = "1 \\\n2 \n3 \n\n4 \n5 \n6 "
+
+ assert df_s == expected
+
+ df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]})
+
+ df_s = df.to_string(line_width=1, index=False, header=False)
+ expected = "11 \\\n22 \n33 \n\n4 \n5 \n6 "
+
+ assert df_s == expected
+
+ df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]})
+
+ df_s = df.to_string(line_width=1, index=False, header=False)
+ expected = " 11 \\\n 22 \n-33 \n\n 4 \n 5 \n-6 "
+
+ assert df_s == expected
+
+ def test_to_string_line_width_with_both_index_and_header(self):
+ # GH 53054
+ df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
+
+ df_s = df.to_string(line_width=1)
+ expected = (
+ " x \\\n0 1 \n1 2 \n2 3 \n\n y \n0 4 \n1 5 \n2 6 "
+ )
+
+ assert df_s == expected
+
+ df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]})
+
+ df_s = df.to_string(line_width=1)
+ expected = (
+ " x \\\n0 11 \n1 22 \n2 33 \n\n y \n0 4 \n1 5 \n2 6 "
+ )
+
+ assert df_s == expected
+
+ df = DataFrame({"x": [11, 22, -33], "y": [4, 5, -6]})
+
+ df_s = df.to_string(line_width=1)
+ expected = (
+ " x \\\n0 11 \n1 22 \n2 -33 \n\n y \n0 4 \n1 5 \n2 -6 "
+ )
assert df_s == expected
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index 8c3474220cde8..94f4066ea1cb2 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -2218,3 +2218,23 @@ def test_parse_dates_dict_format_index(all_parsers):
index=Index([Timestamp("2019-12-31"), Timestamp("2020-12-31")], name="a"),
)
tm.assert_frame_equal(result, expected)
+
+
+def test_parse_dates_arrow_engine(all_parsers):
+ # GH#53295
+ parser = all_parsers
+ data = """a,b
+2000-01-01 00:00:00,1
+2000-01-01 00:00:01,1"""
+
+ result = parser.read_csv(StringIO(data), parse_dates=["a"])
+ expected = DataFrame(
+ {
+ "a": [
+ Timestamp("2000-01-01 00:00:00"),
+ Timestamp("2000-01-01 00:00:01"),
+ ],
+ "b": 1,
+ }
+ )
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index d2fb4a8426cf8..b749a863a3937 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -1496,6 +1496,18 @@ def test_escaped_table_name(self):
tm.assert_frame_equal(res, df)
+ def test_read_sql_duplicate_columns(self):
+ # GH#53117
+ df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": 1})
+ df.to_sql("test_table", self.conn, index=False)
+
+        result = pd.read_sql("SELECT a, b, a + 1 as a, c FROM test_table;", self.conn)
+ expected = DataFrame(
+ [[1, 0.1, 2, 1], [2, 0.2, 3, 1], [3, 0.3, 4, 1]],
+ columns=["a", "b", "a", "c"],
+ )
+ tm.assert_frame_equal(result, expected)
+
@pytest.mark.skipif(not SQLALCHEMY_INSTALLED, reason="SQLAlchemy not installed")
class TestSQLApi(SQLAlchemyMixIn, _TestSQLApi):
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index a7b007f043da9..9d1346b4ad073 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -1463,7 +1463,9 @@ def test_different(self, right_vals):
result = merge(left, right, on="A")
assert is_object_dtype(result.A.dtype)
- @pytest.mark.parametrize("d1", [np.int64, np.int32, np.int16, np.int8, np.uint8])
+ @pytest.mark.parametrize(
+ "d1", [np.int64, np.int32, np.intc, np.int16, np.int8, np.uint8]
+ )
@pytest.mark.parametrize("d2", [np.int64, np.float64, np.float32, np.float16])
def test_join_multi_dtypes(self, d1, d2):
dtype1 = np.dtype(d1)
@@ -2750,3 +2752,30 @@ def test_merge_arrow_and_numpy_dtypes(dtype):
result = df2.merge(df)
expected = df2.copy()
tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("how", ["inner", "left", "outer", "right"])
+@pytest.mark.parametrize("tz", [None, "America/Chicago"])
+def test_merge_datetime_different_resolution(tz, how):
+ # https://github.com/pandas-dev/pandas/issues/53200
+ vals = [
+ pd.Timestamp(2023, 5, 12, tz=tz),
+ pd.Timestamp(2023, 5, 13, tz=tz),
+ pd.Timestamp(2023, 5, 14, tz=tz),
+ ]
+ df1 = DataFrame({"t": vals[:2], "a": [1.0, 2.0]})
+ df1["t"] = df1["t"].dt.as_unit("ns")
+ df2 = DataFrame({"t": vals[1:], "b": [1.0, 2.0]})
+ df2["t"] = df2["t"].dt.as_unit("s")
+
+ expected = DataFrame({"t": vals, "a": [1.0, 2.0, np.nan], "b": [np.nan, 1.0, 2.0]})
+ expected["t"] = expected["t"].dt.as_unit("ns")
+ if how == "inner":
+ expected = expected.iloc[[1]].reset_index(drop=True)
+ elif how == "left":
+ expected = expected.iloc[[0, 1]]
+ elif how == "right":
+ expected = expected.iloc[[1, 2]].reset_index(drop=True)
+
+ result = df1.merge(df2, on="t", how=how)
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py
index 74371830f3a19..e211bc233b9cf 100644
--- a/pandas/tests/series/accessors/test_dt_accessor.py
+++ b/pandas/tests/series/accessors/test_dt_accessor.py
@@ -392,6 +392,8 @@ def test_dt_round_nonnano_higher_resolution_no_op(self, freq):
result = ser.dt.round(freq)
tm.assert_series_equal(result, expected)
+ assert not np.shares_memory(ser.array._ndarray, result.array._ndarray)
+
def test_dt_namespace_accessor_categorical(self):
# GH 19468
dti = DatetimeIndex(["20171111", "20181212"]).repeat(2)
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
index 82c2375ffd628..e741fd310eb41 100644
--- a/pandas/tests/tools/test_to_datetime.py
+++ b/pandas/tests/tools/test_to_datetime.py
@@ -2953,6 +2953,9 @@ class TestDatetimeParsingWrappers:
("2005-11-09 10:15", datetime(2005, 11, 9, 10, 15)),
("2005-11-09 08H", datetime(2005, 11, 9, 8, 0)),
("2005/11/09 10:15", datetime(2005, 11, 9, 10, 15)),
+ ("2005/11/09 10:15:32", datetime(2005, 11, 9, 10, 15, 32)),
+ ("2005/11/09 10:15:32 AM", datetime(2005, 11, 9, 10, 15, 32)),
+ ("2005/11/09 10:15:32 PM", datetime(2005, 11, 9, 22, 15, 32)),
("2005/11/09 08H", datetime(2005, 11, 9, 8, 0)),
("Thu Sep 25 10:36:28 2003", datetime(2003, 9, 25, 10, 36, 28)),
("Thu Sep 25 2003", datetime(2003, 9, 25)),
diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py
index 499bcae5e90f0..1d969e648b752 100644
--- a/pandas/tests/tools/test_to_numeric.py
+++ b/pandas/tests/tools/test_to_numeric.py
@@ -510,6 +510,8 @@ def test_ignore_downcast_neg_to_unsigned():
tm.assert_numpy_array_equal(res, expected)
+# Warning on 32-bit platforms
+@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning")
@pytest.mark.parametrize("downcast", ["integer", "signed", "unsigned"])
@pytest.mark.parametrize(
"data,expected",
diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py
index 6500afdf87beb..e33c6e37ac0e7 100644
--- a/pandas/tests/tslibs/test_parsing.py
+++ b/pandas/tests/tslibs/test_parsing.py
@@ -201,8 +201,10 @@ def test_parsers_month_freq(date_str, expected):
("2011-12-30T00:00:00.000000+9:0", "%Y-%m-%dT%H:%M:%S.%f%z"),
("2011-12-30T00:00:00.000000+09:", None),
("2011-12-30 00:00:00.000000", "%Y-%m-%d %H:%M:%S.%f"),
- ("Tue 24 Aug 2021 01:30:48 AM", "%a %d %b %Y %H:%M:%S %p"),
- ("Tuesday 24 Aug 2021 01:30:48 AM", "%A %d %b %Y %H:%M:%S %p"),
+ ("Tue 24 Aug 2021 01:30:48", "%a %d %b %Y %H:%M:%S"),
+ ("Tuesday 24 Aug 2021 01:30:48", "%A %d %b %Y %H:%M:%S"),
+ ("Tue 24 Aug 2021 01:30:48 AM", "%a %d %b %Y %I:%M:%S %p"),
+ ("Tuesday 24 Aug 2021 01:30:48 AM", "%A %d %b %Y %I:%M:%S %p"),
("27.03.2003 14:55:00.000", "%d.%m.%Y %H:%M:%S.%f"), # GH50317
],
)
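
The corrected cases reflect that `%H` (24-hour) does not combine meaningfully with `%p`, while `%I` (12-hour) does; a quick strptime check:

from datetime import datetime

s = "Tue 24 Aug 2021 01:30:48 AM"
print(datetime.strptime(s, "%a %d %b %Y %I:%M:%S %p"))  # 2021-08-24 01:30:48
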
diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py
index 968da7cf60105..d0e393e41c623 100644
--- a/pandas/util/_decorators.py
+++ b/pandas/util/_decorators.py
@@ -195,7 +195,7 @@ def wrapper(*args, **kwargs) -> Callable[..., Any]:
else:
new_arg_value = old_arg_value
msg = (
- f"the {repr(old_arg_name)}' keyword is deprecated, "
+ f"the {repr(old_arg_name)} keyword is deprecated, "
f"use {repr(new_arg_name)} instead."
             )
| |